In [2]:
import os
import json
import pandas as pd
import pubchempy as pcp

In [79]:
drugs = pd.read_csv('pubchem_atc_2043.csv')
drugs.sort_values('cid')

Unnamed: 0,cid,name,atc_code
0,2081,Alaproclate,N06AB07
1,2082,Albendazole,P02CA03
2,2083,Salbutamol,R03AC02
3,2088,Alendronic acid,M05BA04
4,2091,"5-[2-[7a-methyl-1-(6-methylheptan-2-yl)-2,3,3a...",A11CC03
...,...,...,...
2038,179344,Eslicarbazepine acetate,N03AF04
2039,181458,Memantine hydrochloride,N06DX01
2040,182137,Esketamine,N06AX27
2041,183797,Melagatran,B01AE04


In [80]:
drugs = drugs.drop_duplicates()
drugs

Unnamed: 0,cid,name,atc_code
0,2081,Alaproclate,N06AB07
1,2082,Albendazole,P02CA03
2,2083,Salbutamol,R03AC02
3,2088,Alendronic acid,M05BA04
4,2091,"5-[2-[7a-methyl-1-(6-methylheptan-2-yl)-2,3,3a...",A11CC03
...,...,...,...
2038,179344,Eslicarbazepine acetate,N03AF04
2039,181458,Memantine hydrochloride,N06DX01
2040,182137,Esketamine,N06AX27
2041,183797,Melagatran,B01AE04


In [81]:
drugs['compound'] = drugs['cid'].map(pcp.Compound.from_cid)

In [82]:
drugs[[
    'HBondAcceptorCount',
    'HBondDonorCount',
    'MolecularWeight',
    'LogP',
    'IsomericSMILES']] = drugs['compound'].apply(lambda x: pd.Series([
        x.h_bond_acceptor_count,
        x.h_bond_donor_count, 
        x.molecular_weight,
        x.xlogp,
        x.isomeric_smiles]))

In [83]:
drugs

Unnamed: 0,cid,name,atc_code,compound,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,2081,Alaproclate,N06AB07,Compound(2081),3,1,255.74,2.8,CC(C(=O)OC(C)(C)CC1=CC=C(C=C1)Cl)N
1,2082,Albendazole,P02CA03,Compound(2082),4,2,265.33,2.9,CCCSC1=CC2=C(C=C1)N=C(N2)NC(=O)OC
2,2083,Salbutamol,R03AC02,Compound(2083),4,4,239.31,0.3,CC(C)(C)NCC(C1=CC(=C(C=C1)O)CO)O
3,2088,Alendronic acid,M05BA04,Compound(2088),8,6,249.10,-6.5,C(CC(O)(P(=O)(O)O)P(=O)(O)O)CN
4,2091,"5-[2-[7a-methyl-1-(6-methylheptan-2-yl)-2,3,3a...",A11CC03,Compound(2091),2,2,400.6,6.8,CC(C)CCCC(C)C1CCC2C1(CCCC2=CC=C3CC(CC(C3=C)O)O)C
...,...,...,...,...,...,...,...,...,...
2038,179344,Eslicarbazepine acetate,N03AF04,Compound(179344),3,1,296.32,2.0,CC(=O)O[C@H]1CC2=CC=CC=C2N(C3=CC=CC=C13)C(=O)N
2039,181458,Memantine hydrochloride,N06DX01,Compound(181458),1,2,215.76,,CC12CC3CC(C1)(CC(C3)(C2)N)C.Cl
2040,182137,Esketamine,N06AX27,Compound(182137),2,1,237.72,2.2,CN[C@@]1(CCCCC1=O)C2=CC=CC=C2Cl
2041,183797,Melagatran,B01AE04,Compound(183797),6,5,429.5,-1.0,C1CCC(CC1)[C@H](C(=O)N2CC[C@H]2C(=O)NCC3=CC=C(...


In [84]:
drugs.to_pickle('pubchem_atc_2043.pkl')

In [101]:
drugs_1984 = pd.read_pickle('pubchem_atc_1984.pkl')
drugs.head()

Unnamed: 0,cid,atc_code,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,1,N06BX12,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,137,L01XD04,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
2,176,G01AD02,2,1,60.05,-0.2,CC(=O)O
3,187,S01EB09,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C
4,237,P01AX05,4,1,400.0,6.0,CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)C...


In [103]:
drugs_1984 = drugs_1984.drop(['name', 'compound'], axis=1)

In [104]:
drugs_1984.head()

Unnamed: 0,cid,atc_code,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES
0,1,N06BX12,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,137,L01XD04,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN
2,176,G01AD02,2,1,60.05,-0.2,CC(=O)O
3,187,S01EB09,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C
4,237,P01AX05,4,1,400.0,6.0,CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)C...


In [87]:
with open(os.path.join('..', 'dataframes_resources', 'dataframes_props.json')) as props:
    columns = json.load(props)

In [88]:
column_order = columns['column_order_pubchem']
column_order

['CID',
 'HBondAcceptorCount',
 'HBondDonorCount',
 'MolecularWeight',
 'LogP',
 'IsomericSMILES',
 'ATC_Code']

In [105]:
drugs_1984 = drugs_1984.rename(columns = {'atc_code': 'ATC_Code', 'cid': 'CID'})[column_order]

In [106]:
drugs_1984

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,ATC_Code
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N06BX12
1,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,L01XD04
2,176,2,1,60.05,-0.2,CC(=O)O,G01AD02
3,187,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C,S01EB09
4,237,4,1,400.0,6.0,CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)C...,P01AX05
...,...,...,...,...,...,...,...
1907,120226,21,0,520.08,7.6,C(C(C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F...,C10HF21
1908,120508,4,0,361.5,5.0,C1CCN(CC1)CCC2=NC(=NO2)CC(C3=CC=CC=C3)C4=CC=CC=C4,R05DB18
1981,202223,30,25,1165.2,,C[C@H]1[C@@]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@...,J01GA02
1982,202225,16,0,814.0,4.3,C[C@@H]1C[C@@H]([C@H]([C@@H](O1)O[C@H]2[C@H](C...,J01FA08


In [108]:
drugs_2043 = pd.read_csv('pubchem_atc_2043_w.csv')
drugs_2043

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,ATC_Code
0,2081,3,1,255.74,2.8,CC(C(=O)OC(C)(C)CC1=CC=C(C=C1)Cl)N,N06AB07
1,2082,4,2,265.33,2.9,CCCSC1=CC2=C(C=C1)N=C(N2)NC(=O)OC,P02CA03
2,2083,4,4,239.31,0.3,CC(C)(C)NCC(C1=CC(=C(C=C1)O)CO)O,R03AC02
3,2088,8,6,249.10,-6.5,C(CC(O)(P(=O)(O)O)P(=O)(O)O)CN,M05BA04
4,2091,2,2,400.60,6.8,CC(C)CCCC(C)C1CCC2C1(CCCC2=CC=C3CC(CC(C3=C)O)O)C,A11CC03
...,...,...,...,...,...,...,...
2038,179344,3,1,296.32,2.0,CC(=O)O[C@H]1CC2=CC=CC=C2N(C3=CC=CC=C13)C(=O)N,N03AF04
2039,181458,1,2,215.76,,CC12CC3CC(C1)(CC(C3)(C2)N)C.Cl,N06DX01
2040,182137,2,1,237.72,2.2,CN[C@@]1(CCCCC1=O)C2=CC=CC=C2Cl,N06AX27
2041,183797,6,5,429.50,-1.0,C1CCC(CC1)[C@H](C(=O)N2CC[C@H]2C(=O)NCC3=CC=C(...,B01AE04


In [109]:
drugs = pd.concat([drugs_1984, drugs_2043])

In [111]:
drugs = drugs.drop_duplicates()

In [112]:
drugs

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,IsomericSMILES,ATC_Code
0,1,4,0,203.24,0.4,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,N06BX12
1,137,4,2,131.13,-3.8,C(CC(=O)O)C(=O)CN,L01XD04
2,176,2,1,60.05,-0.2,CC(=O)O,G01AD02
3,187,2,0,146.21,0.2,CC(=O)OCC[N+](C)(C)C,S01EB09
4,237,4,1,400.0,6.0,CCN(CC)CCCC(C)NC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)C...,P01AX05
...,...,...,...,...,...,...,...
2038,179344,3,1,296.32,2.0,CC(=O)O[C@H]1CC2=CC=CC=C2N(C3=CC=CC=C13)C(=O)N,N03AF04
2039,181458,1,2,215.76,,CC12CC3CC(C1)(CC(C3)(C2)N)C.Cl,N06DX01
2040,182137,2,1,237.72,2.2,CN[C@@]1(CCCCC1=O)C2=CC=CC=C2Cl,N06AX27
2041,183797,6,5,429.5,-1.0,C1CCC(CC1)[C@H](C(=O)N2CC[C@H]2C(=O)NCC3=CC=C(...,B01AE04


In [113]:
drugs.to_csv('pubchem_atc_184933.csv', index=False)