In [1]:
import os
import json
import pandas as pd
import pubchempy as pcp

In [2]:
pubchem = pd.read_csv(os.path.join('..', 'pubchem', 'pubchem_dataset_label_clean.csv'))

In [3]:
pubchem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4375 entries, 0 to 4374
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CID                   4375 non-null   int64  
 1   HBondAcceptorCount    4375 non-null   int64  
 2   HBondDonorCount       4375 non-null   int64  
 3   MolecularWeight       4375 non-null   float64
 4   LogP                  3809 non-null   float64
 5   RuleFive              4375 non-null   int64  
 6   IsomericSMILES        4375 non-null   object 
 7   ATC_Code              4375 non-null   object 
 8   ATC_Code_Short        4375 non-null   object 
 9   ATC_Code_Explanation  4375 non-null   object 
dtypes: float64(2), int64(4), object(4)
memory usage: 341.9+ KB


In [4]:
drugbank = pd.read_csv('drugbank_dataframe_mapping.csv')

In [5]:
drugbank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12227 entries, 0 to 12226
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            11586 non-null  object 
 1   HBondAcceptorCount  11580 non-null  float64
 2   HBondDonorCount     11580 non-null  float64
 3   MolecularWeight     11586 non-null  float64
 4   LogP                11570 non-null  float64
 5   RuleFive            11587 non-null  float64
 6   IsomericSMILES      11583 non-null  object 
 7   ATC_Code            3024 non-null   object 
dtypes: float64(5), object(3)
memory usage: 764.3+ KB


seleccionamos los compuestos del drugbank que tienen un atc

In [6]:
drugbank_atc = drugbank[drugbank['ATC_Code'].isna() == False]
drugbank_atc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3024 entries, 0 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            2940 non-null   object 
 1   HBondAcceptorCount  2940 non-null   float64
 2   HBondDonorCount     2940 non-null   float64
 3   MolecularWeight     2940 non-null   float64
 4   LogP                2936 non-null   float64
 5   RuleFive            2941 non-null   float64
 6   IsomericSMILES      2938 non-null   object 
 7   ATC_Code            3024 non-null   object 
dtypes: float64(5), object(3)
memory usage: 212.6+ KB


eliminamos compuestos que no tienen SMILES, en su gran mayoría son polímeros sin estructura sencilla

In [7]:
drugbank_atc_clean = drugbank_atc[drugbank_atc['IsomericSMILES'].isna()==False]


In [8]:
drugbank_atc_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2938 entries, 0 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            2938 non-null   object 
 1   HBondAcceptorCount  2937 non-null   float64
 2   HBondDonorCount     2937 non-null   float64
 3   MolecularWeight     2938 non-null   float64
 4   LogP                2934 non-null   float64
 5   RuleFive            2938 non-null   float64
 6   IsomericSMILES      2938 non-null   object 
 7   ATC_Code            2938 non-null   object 
dtypes: float64(5), object(3)
memory usage: 206.6+ KB


Eliminamos los compuestos que ya están presentes en pubchem gracias a que el código ATC es único

In [9]:
drugbank_atc_new = drugbank_atc_clean[drugbank_atc_clean['ATC_Code'].isin(pubchem['ATC_Code']) == False]
drugbank_atc_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 679 entries, 1 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            679 non-null    object 
 1   HBondAcceptorCount  679 non-null    float64
 2   HBondDonorCount     679 non-null    float64
 3   MolecularWeight     679 non-null    float64
 4   LogP                678 non-null    float64
 5   RuleFive            679 non-null    float64
 6   IsomericSMILES      679 non-null    object 
 7   ATC_Code            679 non-null    object 
dtypes: float64(5), object(3)
memory usage: 47.7+ KB


Estos compuestos tienen un atc code que no está presente en pubchem. Analizando los datos he visto que varias moléculas muestran códigos ATC distintos en una y otra biblioteca. Para determinar si son duplicados con distinto atc o moléculas no presentes en pubchem, obtengo el CID de estos compuestos a partir de su smiles.

In [10]:
def get_cid_from_smiles(smiles):
    try:
        mol = pcp.get_compounds(smiles, 'smiles')
        return int(mol[0].cid)
    except:
        print(f'no cid for smiles {smiles}')
        return None

In [11]:
sample = drugbank_atc_new.iloc[:5]
sample['CID'] = sample['IsomericSMILES'].map(get_cid_from_smiles)
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 21
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            5 non-null      object 
 1   HBondAcceptorCount  5 non-null      float64
 2   HBondDonorCount     5 non-null      float64
 3   MolecularWeight     5 non-null      float64
 4   LogP                5 non-null      float64
 5   RuleFive            5 non-null      float64
 6   IsomericSMILES      5 non-null      object 
 7   ATC_Code            5 non-null      object 
 8   CID                 5 non-null      int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 400.0+ bytes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['CID'] = sample['IsomericSMILES'].map(get_cid_from_smiles)


Creo una columna cid en el dataframe de drugbank. Este es un paso que lleva mucho tiempo, por eso espero a tener el dataset más pequeño antes de hacerlo.

In [392]:
#drugbank_atc_new['CID'] = drugbank_atc_new['IsomericSMILES'].map(get_cid_from_smiles)

no cid for smiles [H][C@]12C[C@]1(NC(=O)[C@]1([H])C[C@H](C[C@@]1([H])C(=O)N(C)CCCC\C=C/2)OC1=CC(=NC2=C1C=CC(OC)=C2C)C1=NC(=CS1)C(C)C)C(=O)NS(=O)(=O)C1CC1
no cid for smiles [Mg++].[O-][O-]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_atc_new['CID'] = drugbank_atc_new['IsomericSMILES'].map(get_cid_from_smiles)


In [425]:
#drugbank_atc_new.to_csv('new_atc_codes_from_drugbank.csv', index=False)

In [12]:
drugbank_atc_new = pd.read_csv('new_atc_codes_from_drugbank.csv')

In [13]:
drugbank_atc_new

Unnamed: 0,InChIKey,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code,CID
0,GFIJNRVAKGFPGQ-LIJARHBVSA-N,16.0,16.0,1209.3983,-2.40,0.0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,L02AE51,657181
1,PMATZTZNYRCHOR-CGLBZJNRSA-N,12.0,5.0,1202.6350,3.64,0.0,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,L04AD01,5284373
2,KDXKERNSBIXSRK-YFKPBYRVSA-N,4.0,3.0,146.1876,-3.20,1.0,NCCCC[C@H](N)C(O)=O,V03AF11,5962
3,ODKSFYDXXFIFQN-BYPYZUCNSA-N,6.0,5.0,174.2010,-3.20,1.0,N[C@@H](CCCNC(N)=N)C(O)=O,V03AF11,6322
4,CIWBSHSKHKDKBQ-JLAZNSOCSA-N,5.0,4.0,176.1241,-1.90,1.0,[H][C@@]1(OC(=O)C(O)=C1O)[C@@H](O)CO,G01AD03,54670067
...,...,...,...,...,...,...,...,...,...
671,XAYGBKHKBBXDAK-UHFFFAOYSA-N,5.0,1.0,450.5600,1.62,1.0,O=C(N1CCN(CC2CC2)CC1)C1=CC=C(NS(=O)(=O)C2=CC=C...,G01AE10,59634741
672,VEVMYTDOWUQLGI-UHFFFAOYSA-N,4.0,3.0,409.9600,5.22,0.0,CC(C)(C)NCC1=CC(NC2=CC=NC3=CC(Cl)=CC=C23)=C2CC...,P01BF08,9851775
673,GBECUEIQVRDUKB-RYDPDVNUSA-M,0.0,0.0,236.4200,0.53,1.0,Cl[201Tl],V09GX01,16019977
674,WUWFMDMBOJLQIV-UHFFFAOYSA-N,7.0,2.0,404.3490,0.47,1.0,NC1CCN(C1)C1=NC2=C(C=C1F)C(=O)C(=CN2C1=CC=C(F)...,J01MA22,5517


Drop two compounds without CID, one is actualy present in pubchem's dataset and the other is just magnesium peroxide

In [14]:
drugbank_atc_new[drugbank_atc_new['CID'].isna()]

Unnamed: 0,InChIKey,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code,CID


In [15]:
drugbank_atc_new.dropna(inplace=True)

CID are saved as floats instead of integers for some reason, so we need to transform the values to integers

In [16]:
drugbank_atc_new['CID'] = drugbank_atc_new['CID'].astype('int')

We check which CIDs were not present in Pubchems dataset

In [17]:
drugbank_atc_new2 = drugbank_atc_new[drugbank_atc_new['CID'].isin(pubchem['CID'])==False]

In [18]:
drugbank_atc_new2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178 entries, 3 to 675
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            178 non-null    object 
 1   HBondAcceptorCount  178 non-null    float64
 2   HBondDonorCount     178 non-null    float64
 3   MolecularWeight     178 non-null    float64
 4   LogP                178 non-null    float64
 5   RuleFive            178 non-null    float64
 6   IsomericSMILES      178 non-null    object 
 7   ATC_Code            178 non-null    object 
 8   CID                 178 non-null    int32  
dtypes: float64(5), int32(1), object(3)
memory usage: 13.2+ KB


Remove the inchikey column

In [19]:
drugbank_atc_new2.drop('InChIKey', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugbank_atc_new2.drop('InChIKey', axis=1, inplace=True)


Concatenate pubchem with drugbank

In [20]:
atc_dbs = pd.concat([pubchem, drugbank_atc_new2])

In [21]:
atc_dbs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4553 entries, 0 to 675
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CID                   4553 non-null   int64  
 1   HBondAcceptorCount    4553 non-null   float64
 2   HBondDonorCount       4553 non-null   float64
 3   MolecularWeight       4553 non-null   float64
 4   LogP                  3987 non-null   float64
 5   RuleFive              4553 non-null   float64
 6   IsomericSMILES        4553 non-null   object 
 7   ATC_Code              4553 non-null   object 
 8   ATC_Code_Short        4375 non-null   object 
 9   ATC_Code_Explanation  4375 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 391.3+ KB


In [22]:
with open(os.path.join('..','dataframes_resources','dataframes_props.json')) as file:
    res = json.load(file)
atc_codes_explanation_dic = res['atc_codes_explanation']

In [23]:
atc_codes_explanation_dic

{'A': 'ALIMENTARY TRACT AND METABOLISM',
 'B': 'BLOOD AND BLOOD FORMING ORGANS',
 'C': 'CARDIOVASCULAR SYSTEM',
 'D': 'DERMATOLOGICALS',
 'G': 'GENITO URINARY SYSTEM AND SEX HORMONES',
 'H': 'SYSTEMIC HORMONAL PREPARATIONS, EXCL. SEX HORMONES AND INSULINS',
 'J': 'ANTIINFECTIVES FOR SYSTEMIC USE',
 'L': 'ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS',
 'M': 'MUSCULO-SKELETAL SYSTEM',
 'N': 'NERVOUS SYSTEM',
 'P': 'ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS',
 'R': 'RESPIRATORY SYSTEM',
 'S': 'SENSORY ORGANS',
 'V': 'VARIOUS',
 'I': 'ANTIINFLAMMATORY',
 'O': 'LIPID REGULATION'}

In [24]:
atc_dbs['ATC_Code_Short'] = atc_dbs['ATC_Code'].str[0]

In [25]:
atc_dbs['ATC_Code_Explanation'] = atc_dbs['ATC_Code_Short'].map(atc_codes_explanation_dic)

In [26]:
atc_dbs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code,ATC_Code_Short,ATC_Code_Explanation
0,1,4.0,0.0,203.240,0.40,1.0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N06BX12,N,NERVOUS SYSTEM
1,119,3.0,2.0,103.120,-3.17,1.0,C(CC(=O)O)CN,N03AG03,N,NERVOUS SYSTEM
2,137,4.0,2.0,131.130,-1.50,1.0,C(CC(=O)O)C(=O)CN,L01XD04,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,176,2.0,1.0,60.050,-0.17,1.0,CC(=O)O,G01AD02,G,GENITO URINARY SYSTEM AND SEX HORMONES
4,187,2.0,0.0,146.210,0.20,1.0,CC(=O)OCC[N+](C)(C)C,S01EB09,S,SENSORY ORGANS
...,...,...,...,...,...,...,...,...,...,...
669,91800164,19.0,9.0,1011.909,-3.20,0.0,[68Ga+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCCNC(=O...,V09IX14,V,VARIOUS
670,11967809,7.0,0.0,774.970,6.05,0.0,[Na+].CC1(C)\C(=C/C=C/C=C/C=C/C2=[N+](CCCCS([O...,V04CX01,V,VARIOUS
672,9851775,4.0,3.0,409.960,5.22,0.0,CC(C)(C)NCC1=CC(NC2=CC=NC3=CC(Cl)=CC=C23)=C2CC...,P01BF08,P,"ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPEL..."
673,16019977,0.0,0.0,236.420,0.53,1.0,Cl[201Tl],V09GX01,V,VARIOUS


In [27]:
atc_dbs = atc_dbs.drop('ATC_Code', axis=1)

In [28]:
atc_dbs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code_Short,ATC_Code_Explanation
0,1,4.0,0.0,203.240,0.40,1.0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N,NERVOUS SYSTEM
1,119,3.0,2.0,103.120,-3.17,1.0,C(CC(=O)O)CN,N,NERVOUS SYSTEM
2,137,4.0,2.0,131.130,-1.50,1.0,C(CC(=O)O)C(=O)CN,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,176,2.0,1.0,60.050,-0.17,1.0,CC(=O)O,G,GENITO URINARY SYSTEM AND SEX HORMONES
4,187,2.0,0.0,146.210,0.20,1.0,CC(=O)OCC[N+](C)(C)C,S,SENSORY ORGANS
...,...,...,...,...,...,...,...,...,...
669,91800164,19.0,9.0,1011.909,-3.20,0.0,[68Ga+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCCNC(=O...,V,VARIOUS
670,11967809,7.0,0.0,774.970,6.05,0.0,[Na+].CC1(C)\C(=C/C=C/C=C/C=C/C2=[N+](CCCCS([O...,V,VARIOUS
672,9851775,4.0,3.0,409.960,5.22,0.0,CC(C)(C)NCC1=CC(NC2=CC=NC3=CC(Cl)=CC=C23)=C2CC...,P,"ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPEL..."
673,16019977,0.0,0.0,236.420,0.53,1.0,Cl[201Tl],V,VARIOUS


In [29]:
atc_dbs.to_csv('pubchem_drugbank.csv', index=False)

In [31]:
gitter = pd.read_csv(os.path.join('..', 'gitter', 'gitter_dataset_label_clean.csv'))

In [33]:
atc_dbs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code_Short,ATC_Code_Explanation
0,1,4.0,0.0,203.240,0.40,1.0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N,NERVOUS SYSTEM
1,119,3.0,2.0,103.120,-3.17,1.0,C(CC(=O)O)CN,N,NERVOUS SYSTEM
2,137,4.0,2.0,131.130,-1.50,1.0,C(CC(=O)O)C(=O)CN,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,176,2.0,1.0,60.050,-0.17,1.0,CC(=O)O,G,GENITO URINARY SYSTEM AND SEX HORMONES
4,187,2.0,0.0,146.210,0.20,1.0,CC(=O)OCC[N+](C)(C)C,S,SENSORY ORGANS
...,...,...,...,...,...,...,...,...,...
669,91800164,19.0,9.0,1011.909,-3.20,0.0,[68Ga+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCCNC(=O...,V,VARIOUS
670,11967809,7.0,0.0,774.970,6.05,0.0,[Na+].CC1(C)\C(=C/C=C/C=C/C=C/C2=[N+](CCCCS([O...,V,VARIOUS
672,9851775,4.0,3.0,409.960,5.22,0.0,CC(C)(C)NCC1=CC(NC2=CC=NC3=CC(Cl)=CC=C23)=C2CC...,P,"ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPEL..."
673,16019977,0.0,0.0,236.420,0.53,1.0,Cl[201Tl],V,VARIOUS


In [32]:
gitter

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,LogP,RuleFive,ATC_Code_Short,ATC_Code_Explanation
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,1,B,BLOOD AND BLOOD FORMING ORGANS
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,0,C,CARDIOVASCULAR SYSTEM
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,1,J,ANTIINFECTIVES FOR SYSTEMIC USE
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,1,N,NERVOUS SYSTEM
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,1,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...,...,...,...,...,...,...,...
6930,9231,0,0,C1=CC=C2C=CC=C2C=C1,128.174,3.2,1,I,ANTIINFLAMMATORY
6931,3038520,6,1,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,334.804,,1,N,NERVOUS SYSTEM
6932,88747,4,1,C1CN(CCN1)C2=NC=CC=N2,164.212,0.0,1,N,NERVOUS SYSTEM
6933,16640802,2,1,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,285.874,,1,N,NERVOUS SYSTEM


In [38]:
gitter_new = gitter[gitter['CID'].isin(atc_dbs['CID'])==False]

In [39]:
gitter_new

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,LogP,RuleFive,ATC_Code_Short,ATC_Code_Explanation
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,1,B,BLOOD AND BLOOD FORMING ORGANS
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,0,C,CARDIOVASCULAR SYSTEM
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,1,J,ANTIINFECTIVES FOR SYSTEMIC USE
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,1,N,NERVOUS SYSTEM
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,1,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...,...,...,...,...,...,...,...
6930,9231,0,0,C1=CC=C2C=CC=C2C=C1,128.174,3.2,1,I,ANTIINFLAMMATORY
6931,3038520,6,1,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,334.804,,1,N,NERVOUS SYSTEM
6932,88747,4,1,C1CN(CCN1)C2=NC=CC=N2,164.212,0.0,1,N,NERVOUS SYSTEM
6933,16640802,2,1,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,285.874,,1,N,NERVOUS SYSTEM


In [40]:
all_drugs = pd.concat([atc_dbs, gitter_new])

In [41]:
all_drugs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code_Short,ATC_Code_Explanation
0,1,4.0,0.0,203.240,0.40,1.0,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N,NERVOUS SYSTEM
1,119,3.0,2.0,103.120,-3.17,1.0,C(CC(=O)O)CN,N,NERVOUS SYSTEM
2,137,4.0,2.0,131.130,-1.50,1.0,C(CC(=O)O)C(=O)CN,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,176,2.0,1.0,60.050,-0.17,1.0,CC(=O)O,G,GENITO URINARY SYSTEM AND SEX HORMONES
4,187,2.0,0.0,146.210,0.20,1.0,CC(=O)OCC[N+](C)(C)C,S,SENSORY ORGANS
...,...,...,...,...,...,...,...,...,...
6930,9231,0.0,0.0,128.174,3.20,1.0,C1=CC=C2C=CC=C2C=C1,I,ANTIINFLAMMATORY
6931,3038520,6.0,1.0,334.804,,1.0,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,N,NERVOUS SYSTEM
6932,88747,4.0,1.0,164.212,0.00,1.0,C1CN(CCN1)C2=NC=CC=N2,N,NERVOUS SYSTEM
6933,16640802,2.0,1.0,285.874,,1.0,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,N,NERVOUS SYSTEM


In [43]:
all_drugs.to_csv('all_label_drugs.csv', index=False)