In [33]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Draw

In [50]:
# Load the approved-drugs table (semicolon-separated CSV) and preview the first rows.
df = pd.read_csv('../Repurposing/Approved_Drugs_DB.csv', sep=';')
df.head()

Unnamed: 0,ID,Name,MW,SMILES
0,DB00006,Bivalirudin,2180.2853,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00014,Goserelin,1269.4105,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2,DB00027,Gramicidin D,1811.253,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3,DB00035,Desmopressin,1069.22,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
4,DB00050,Cetrorelix,1431.038,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...


In [51]:
def calculate_descriptors(smiles):
    """Compute basic RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    pandas.Series
        Entries ``H_Donors``, ``H_Acceptors``, ``LogP``, ``TPSA`` and
        ``Heavy_Atoms``. Every entry is ``None`` when ``smiles`` is not a
        string (e.g. a missing CSV cell read as NaN) or when RDKit cannot
        parse it.
    """
    # Chem.MolFromSmiles raises on non-string input instead of returning None,
    # so screen those values out and route them to the same "failed" result.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # If the conversion failed, return null values for every descriptor.
    if mol is None:
        return pd.Series({
            'H_Donors': None,
            'H_Acceptors': None,
            'LogP': None,
            'TPSA': None,
            'Heavy_Atoms': None
        })

    return pd.Series({
        'H_Donors': rdMolDescriptors.CalcNumHBD(mol),
        'H_Acceptors': rdMolDescriptors.CalcNumHBA(mol),
        'LogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'Heavy_Atoms': mol.GetNumHeavyAtoms()
    })

def check_lipinski(row):
    """Flag whether a row satisfies all four Lipinski criteria.

    Criteria: H_Donors <= 5, H_Acceptors <= 10, MW < 500, LogP <= 5.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas.Series row)
        Must provide the keys 'H_Donors', 'H_Acceptors', 'MW' and 'LogP'.

    Returns
    -------
    int
        1 when every criterion holds; 0 otherwise, including when any of the
        four values is missing (None, NaN or pd.NA).
    """
    values = [row[key] for key in ('H_Donors', 'H_Acceptors', 'MW', 'LogP')]

    # A plain `is not None` test does not catch NaN (what None becomes in a
    # float column), and None / pd.NA would raise inside the comparisons
    # below; pd.isna recognises all of these as missing.
    if any(pd.isna(value) for value in values):
        return 0

    h_donors, h_acceptors, mw, logp = values
    passes = h_donors <= 5 and h_acceptors <= 10 and mw < 500 and logp <= 5
    return 1 if passes else 0

# Compute the descriptors for every SMILES string.
df_descriptors = df['SMILES'].apply(calculate_descriptors)

# Drop descriptor / 'Lipinski' columns left over from a previous run of this
# cell before concatenating. Without this, re-running the cell yields duplicate
# column names, row['H_Donors'] becomes a Series, and check_lipinski fails.
df = pd.concat(
    [
        df.drop(columns=[*df_descriptors.columns, 'Lipinski'], errors='ignore'),
        df_descriptors,
    ],
    axis=1,
)

# Apply the Lipinski check row by row and store the flag in the 'Lipinski' column.
df['Lipinski'] = df.apply(check_lipinski, axis=1)

[12:13:49] Explicit valence for atom # 84 N, 4, is greater than permitted


In [47]:
# Display the DataFrame (pandas truncates the rendering for long frames).
df

Unnamed: 0,ID,Name,MW,SMILES,H_Donors,H_Acceptors,LogP,TPSA,Heavy_Atoms,Lipinski
0,DB00006,Bivalirudin,2180.2853,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,28.0,29.0,-8.11643,901.57,155.0,0
1,DB00014,Goserelin,1269.4105,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,17.0,16.0,-3.10570,495.89,91.0,0
2,DB00027,Gramicidin D,1811.2530,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,20.0,16.0,4.86760,519.89,131.0,0
3,DB00035,Desmopressin,1069.2200,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,14.0,15.0,-4.13203,435.41,74.0,0
4,DB00050,Cetrorelix,1431.0380,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,17.0,16.0,-0.50613,495.67,102.0,0
...,...,...,...,...,...,...,...,...,...,...
2642,DB19375,Chlorothymol,184.6600,CC(C)C1=CC(Cl)=C(C)C=C1O,1.0,1.0,3.47742,20.23,12.0,1
2643,DB19376,Salicylanilide,213.2360,OC1=CC=CC=C1C(=O)NC1=CC=CC=C1,2.0,2.0,2.64450,49.33,16.0,1
2644,DB19378,Megestrol,342.4790,[H][C@@]12CC[C@](O)(C(C)=O)[C@@]1(C)CC[C@@]1([...,1.0,3.0,4.00450,54.37,25.0,1
2645,DB19379,Syrosingopine,666.7240,[H][C@]12C[C@@H](OC(=O)C3=CC(OC)=C(OC(=O)OCC)C...,1.0,12.0,4.69780,144.08,48.0,0


In [56]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

np.int64(0)

In [52]:
# Count missing values per column.
df.isnull().sum()

ID             0
Name           0
MW             0
SMILES         0
H_Donors       1
H_Acceptors    1
LogP           1
TPSA           1
Heavy_Atoms    1
Lipinski       0
dtype: int64

In [53]:
# Show the rows that have at least one missing value.
df[df.isnull().any(axis=1)]

Unnamed: 0,ID,Name,MW,SMILES,H_Donors,H_Acceptors,LogP,TPSA,Heavy_Atoms,Lipinski
1820,DB09385,Cyanocobalamin Co-57,1354.399,[57Co+3].[C-]#N.C[C@H](CNC(=O)CC[C@]1(C)[C@@H]...,,,,,,0


In [57]:
# Write the table to a CSV file in the working directory, omitting the index.
df.to_csv('Approved_Drugs_DB_Descriptors.csv', index=False)

In [59]:
# Load the NuBBE table (semicolon-separated CSV) and display it.
df_2 = pd.read_csv('../NuBBE/NuBBE_DB_MW.csv', sep=';')
df_2

Unnamed: 0,ID,MW,SMILES
0,NUBBE_1,290.354,O=C(OC/C=C(\C)/CCC=C(C)C)c1cc(O)c(O)cc1
1,NUBBE_2,290.354,O=C(c1cc(O)c(cc1)OC/C=C(\CCC=C(C)C)/C)O
2,NUBBE_3,290.354,Oc1c(OC/C=C(\C)/CCC=C(C)C)cc(C(=O)O)cc1
3,NUBBE_4,274.355,Oc1ccc(C(=O)OC/C=C(\C)/CCC=C(C)C)cc1
4,NUBBE_5,274.355,O=C(O)c1ccc(OC/C=C(\C)/CCC=C(C)C)cc1
...,...,...,...
10827,NUBBE_10828,856.018,O=C1N([C@@H](C(=O)N[C@H](C(=O)NCCCC[C@H](C(=O)...
10828,NUBBE_10829,878.029,O=C1N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)NCCCC...
10829,NUBBE_10830,998.172,O=C1O[C@H]([C@H](NC(=O)[C@H](NC(=O)[C@@H]2N(C(...
10830,NUBBE_10831,377.390,O=C1C(=C(C(=O)c2c1cccc2)c1ccc(cc1)C=O)OC(=O)N(...


In [60]:
# NOTE(review): this repeats the calculate_descriptors definition from the
# earlier cell; keep the two in sync (or delete this copy).
def calculate_descriptors(smiles):
    """Compute basic RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    pandas.Series
        Entries ``H_Donors``, ``H_Acceptors``, ``LogP``, ``TPSA`` and
        ``Heavy_Atoms``. Every entry is ``None`` when ``smiles`` is not a
        string (e.g. a missing CSV cell read as NaN) or when RDKit cannot
        parse it.
    """
    # Chem.MolFromSmiles raises on non-string input instead of returning None,
    # so screen those values out and route them to the same "failed" result.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # If the conversion failed, return null values for every descriptor.
    if mol is None:
        return pd.Series({
            'H_Donors': None,
            'H_Acceptors': None,
            'LogP': None,
            'TPSA': None,
            'Heavy_Atoms': None
        })

    return pd.Series({
        'H_Donors': rdMolDescriptors.CalcNumHBD(mol),
        'H_Acceptors': rdMolDescriptors.CalcNumHBA(mol),
        'LogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'Heavy_Atoms': mol.GetNumHeavyAtoms()
    })

# NOTE(review): this repeats the check_lipinski definition from the earlier
# cell; keep the two in sync (or delete this copy).
def check_lipinski(row):
    """Flag whether a row satisfies all four Lipinski criteria.

    Criteria: H_Donors <= 5, H_Acceptors <= 10, MW < 500, LogP <= 5.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas.Series row)
        Must provide the keys 'H_Donors', 'H_Acceptors', 'MW' and 'LogP'.

    Returns
    -------
    int
        1 when every criterion holds; 0 otherwise, including when any of the
        four values is missing (None, NaN or pd.NA).
    """
    values = [row[key] for key in ('H_Donors', 'H_Acceptors', 'MW', 'LogP')]

    # A plain `is not None` test does not catch NaN (what None becomes in a
    # float column), and None / pd.NA would raise inside the comparisons
    # below; pd.isna recognises all of these as missing.
    if any(pd.isna(value) for value in values):
        return 0

    h_donors, h_acceptors, mw, logp = values
    passes = h_donors <= 5 and h_acceptors <= 10 and mw < 500 and logp <= 5
    return 1 if passes else 0

# Compute the descriptors for every SMILES string.
df_descriptors = df_2['SMILES'].apply(calculate_descriptors)

# Drop descriptor / 'Lipinski' columns left over from a previous run of this
# cell before concatenating. Without this, re-running the cell yields duplicate
# column names, row['H_Donors'] becomes a Series, and check_lipinski fails.
df_2 = pd.concat(
    [
        df_2.drop(columns=[*df_descriptors.columns, 'Lipinski'], errors='ignore'),
        df_descriptors,
    ],
    axis=1,
)

# Apply the Lipinski check row by row and store the flag in the 'Lipinski' column.
df_2['Lipinski'] = df_2.apply(check_lipinski, axis=1)

[13:50:47] Explicit valence for atom # 34 N, 4, is greater than permitted
[13:50:47] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Can't kekulize mol.  Unkekulized atoms: 1 2 5 6 7 8 12 13 14 15 16 17 18 19 24
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:50:48] Explicit valence for atom # 0 N, 4, is greater than permitted
[13:50:49] Explicit valence for a

In [61]:
# Display the DataFrame (pandas truncates the rendering for long frames).
df_2

Unnamed: 0,ID,MW,SMILES,H_Donors,H_Acceptors,LogP,TPSA,Heavy_Atoms,Lipinski
0,NUBBE_1,290.354,O=C(OC/C=C(\C)/CCC=C(C)C)c1cc(O)c(O)cc1,2.0,4.0,3.94730,66.76,21.0,1
1,NUBBE_2,290.354,O=C(c1cc(O)c(cc1)OC/C=C(\CCC=C(C)C)/C)O,2.0,3.0,4.16190,66.76,21.0,1
2,NUBBE_3,290.354,Oc1c(OC/C=C(\C)/CCC=C(C)C)cc(C(=O)O)cc1,2.0,3.0,4.16190,66.76,21.0,1
3,NUBBE_4,274.355,Oc1ccc(C(=O)OC/C=C(\C)/CCC=C(C)C)cc1,1.0,3.0,4.24170,46.53,20.0,1
4,NUBBE_5,274.355,O=C(O)c1ccc(OC/C=C(\C)/CCC=C(C)C)cc1,1.0,2.0,4.45630,46.53,20.0,1
...,...,...,...,...,...,...,...,...,...
10827,NUBBE_10828,856.018,O=C1N([C@@H](C(=O)N[C@H](C(=O)NCCCC[C@H](C(=O)...,8.0,8.0,2.96880,235.37,62.0,0
10828,NUBBE_10829,878.029,O=C1N[C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)NCCCC...,12.0,9.0,-0.33233,314.93,63.0,0
10829,NUBBE_10830,998.172,O=C1O[C@H]([C@H](NC(=O)[C@H](NC(=O)[C@@H]2N(C(...,7.0,13.0,-0.23200,305.28,71.0,0
10830,NUBBE_10831,377.390,O=C1C(=C(C(=O)c2c1cccc2)c1ccc(cc1)C=O)OC(=O)N(...,0.0,5.0,3.76780,80.75,28.0,1


In [63]:
# Show the rows that have at least one missing value.
df_2[df_2.isnull().any(axis=1)]

Unnamed: 0,ID,MW,SMILES,H_Donors,H_Acceptors,LogP,TPSA,Heavy_Atoms,Lipinski
1440,NUBBE_1441,638.837,C[C@]12[C@@]3(C)[C@H]([C@H](CC3)[C@@]3(C)O[C@@...,,,,,,0
2318,NUBBE_2319,584.399,O=[N]([O-])CCC(=O)O[C@@H]1[C@H](O[C@@H]([C@@H]...,,,,,,0
3966,NUBBE_3967,336.361,O1c2c(OC1)cc1c(-[c]3n(CC1)cc1c(c3)ccc(c1OC)OC)c2,,,,,,0
3993,NUBBE_3994,341.272,O=[N]([O-])c1c2c(c3c(c1)c(OC)ccc3)c1OCOc1cc2C(...,,,,,,0
4067,NUBBE_4068,334.236,[N](=O)([O-])c1c2c(cc3c(c2c2c(c1)cccc2)OCO3)C(...,,,,,,0
...,...,...,...,...,...,...,...,...,...
10085,NUBBE_10086,634.805,O=[N]([O-])c1c(N/N=C/2\C([C@@H]3[C@](CC2)([C@@...,,,,,,0
10086,NUBBE_10087,636.778,O=[N]([O-])c1c(N/N=C/2\C([C@@H]3[C@](CC2)([C@@...,,,,,,0
10161,NUBBE_10162,369.434,O=C(O)C[C@]12C3=[N](CCC1)CC[C@@]13c3c(N([C@H]1...,,,,,,0
10341,NUBBE_10342,467.427,O=[N]([O-])c1c(O)c(O[C@H]2O[C@H]([C@@H]([C@H](...,,,,,,0


In [65]:
# Write the table to a CSV file in the working directory, omitting the index.
df_2.to_csv('NuBBE_DB_Descriptors.csv', index=False)