In [1]:
import numpy as np
import pandas as pd
import json
import os

In [2]:
# Load the raw dataset from the CSV next to this notebook, then display it.
raw_csv_path = 'gitter_dataset.csv'
drugs = pd.read_csv(raw_csv_path)
drugs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,XLogP,drug_class,desalted_SMILES
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,hematologic,BrCCCBr.CN(C)CCCCCCN(C)C
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,cardio,Brc1c(NC2=NCCN2)ccc2nccnc12
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,antiinfective,Brc1ccc(C2CN3CCSC3=N2)cc1
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,cns,Brc1ccc(NC2C3CC4CC(C3)CC2C4)cc1
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,antineoplastic,C#CC(N)CCC(C)N
...,...,...,...,...,...,...,...,...
6930,9231,0,0,C1=CC=C2C=CC=C2C=C1,128.174,3.2,antiinflammatory,c1ccc2cccc-2cc1
6931,3038520,6,1,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,334.804,,cns,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1
6932,88747,4,1,C1CN(CCN1)C2=NC=CC=N2,164.212,0.0,cns,c1cnc(N2CCNCC2)nc1
6933,16640802,2,1,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,285.874,,cns,c1csc(C2(N3CCCCC3)CCCCC2)c1


In [3]:
# Remove the desalted SMILES column; the remaining columns are kept as-is.
drugs = drugs.drop(columns=['desalted_SMILES'])

In [4]:
# Use the shorter column name 'LogP' for the XLogP values.
column_renames = {'XLogP': 'LogP'}
drugs = drugs.rename(columns=column_renames)

In [5]:
# Set lipinski rules
# A compound is flagged (RuleFive = 1) when it violates at most one of the
# four criteria below, i.e. when at least three of them hold.
MAX_HBOND_DONORS = 5        # no more than 5 hydrogen-bond donors
MAX_HBOND_ACCEPTORS = 10    # no more than 10 hydrogen-bond acceptors
MAX_MOLECULAR_WEIGHT = 500  # molecular weight below 500
MAX_LOGP = 5                # LogP must not exceed 5

# `<= 5` is the same test as the earlier `< 6` for whole-number counts.
hdonor = drugs['HBondDonorCount'] <= MAX_HBOND_DONORS
# Fixed: the earlier `< 10` counted exactly 10 acceptors as a violation.
haccept = drugs['HBondAcceptorCount'] <= MAX_HBOND_ACCEPTORS
mw = drugs['MolecularWeight'] < MAX_MOLECULAR_WEIGHT
# Fixed: the earlier `< 5` counted LogP == 5 as a violation.
# NOTE: a missing LogP (NaN) compares as False, so it is still treated as a
# violated criterion, exactly as before.
clogP = drugs['LogP'] <= MAX_LOGP

# Apply rules to dataframe
# Count the satisfied criteria per row; three or four satisfied -> 1, else 0.
criteria_met = (
    hdonor.astype(int)
    + haccept.astype(int)
    + mw.astype(int)
    + clogP.astype(int)
)
drugs['RuleFive'] = np.where(criteria_met >= 3, 1, 0)

In [6]:
# Display the frame to inspect the RuleFive column added in the previous cell.
drugs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,LogP,drug_class,RuleFive
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,hematologic,1
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,cardio,0
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,antiinfective,1
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,cns,1
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,antineoplastic,1
...,...,...,...,...,...,...,...,...
6930,9231,0,0,C1=CC=C2C=CC=C2C=C1,128.174,3.2,antiinflammatory,1
6931,3038520,6,1,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,334.804,,cns,1
6932,88747,4,1,C1CN(CCN1)C2=NC=CC=N2,164.212,0.0,cns,1
6933,16640802,2,1,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,285.874,,cns,1


In [9]:
# Read the shared properties JSON and keep its 'matc_gitter_conversion'
# entry, which is used below to translate drug_class labels into codes.
props_path = os.path.join('..', 'dataframes_resources', 'dataframes_props.json')
with open(props_path) as file:
    res = json.load(file)
matc_gitter_conversion = res['matc_gitter_conversion']

In [10]:
# Display the drug_class -> short code lookup loaded from the JSON file.
matc_gitter_conversion

{'hematologic': 'B',
 'cardio': 'C',
 'antiinfective': 'J',
 'cns': 'N',
 'antineoplastic': 'L',
 'reproductivecontrol': 'G',
 'dermatologic': 'D',
 'antiinflammatory': 'I',
 'respiratorysystem': 'R',
 'gastrointestinal': 'A',
 'lipidregulating': 'O',
 'urological': 'G'}

In [12]:
# Translate every drug_class label into its short code via the lookup dict.
drug_class_labels = drugs['drug_class']
drugs['MATC_Code_Short'] = drug_class_labels.map(matc_gitter_conversion)

In [13]:
# Read the shared properties JSON again and keep its 'matc_codes_explanation'
# entry, which is used below to turn short codes into full descriptions.
props_path = os.path.join('..', 'dataframes_resources', 'dataframes_props.json')
with open(props_path) as file:
    res = json.load(file)
matc_codes_explanation = res['matc_codes_explanation']

In [14]:
# Display the short code -> description lookup loaded from the JSON file.
matc_codes_explanation

{'A': 'ALIMENTARY TRACT AND METABOLISM',
 'B': 'BLOOD AND BLOOD FORMING ORGANS',
 'C': 'CARDIOVASCULAR SYSTEM',
 'D': 'DERMATOLOGICALS',
 'G': 'GENITO URINARY SYSTEM AND SEX HORMONES',
 'H': 'SYSTEMIC HORMONAL PREPARATIONS, EXCL. SEX HORMONES AND INSULINS',
 'J': 'ANTIINFECTIVES FOR SYSTEMIC USE',
 'L': 'ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS',
 'M': 'MUSCULO-SKELETAL SYSTEM',
 'N': 'NERVOUS SYSTEM',
 'P': 'ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS',
 'R': 'RESPIRATORY SYSTEM',
 'S': 'SENSORY ORGANS',
 'V': 'VARIOUS',
 'I': 'ANTIINFLAMMATORY',
 'O': 'LIPID REGULATION'}

In [15]:
# Translate each short code into its full description via the lookup dict.
short_codes = drugs['MATC_Code_Short']
drugs['MATC_Code_Explanation'] = short_codes.map(matc_codes_explanation)

In [16]:
# Display the frame to inspect the MATC_Code_Short and MATC_Code_Explanation columns.
drugs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,LogP,drug_class,RuleFive,MATC_Code_Short,MATC_Code_Explanation
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,hematologic,1,B,BLOOD AND BLOOD FORMING ORGANS
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,cardio,0,C,CARDIOVASCULAR SYSTEM
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,antiinfective,1,J,ANTIINFECTIVES FOR SYSTEMIC USE
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,cns,1,N,NERVOUS SYSTEM
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,antineoplastic,1,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...,...,...,...,...,...,...,...,...
6930,9231,0,0,C1=CC=C2C=CC=C2C=C1,128.174,3.2,antiinflammatory,1,I,ANTIINFLAMMATORY
6931,3038520,6,1,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=NC=CC=N4.Cl,334.804,,cns,1,N,NERVOUS SYSTEM
6932,88747,4,1,C1CN(CCN1)C2=NC=CC=N2,164.212,0.0,cns,1,N,NERVOUS SYSTEM
6933,16640802,2,1,C1CCC(CC1)(C2=CC=CS2)N3CCCCC3.Cl,285.874,,cns,1,N,NERVOUS SYSTEM


In [17]:
# The original class label is no longer needed once the code columns exist.
drugs = drugs.drop(columns='drug_class')

In [18]:
# Persist the labelled frame; the row index is not written to the file.
output_csv = 'gitter_dataset_label_clean.csv'
drugs.to_csv(output_csv, index=False)