In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from tqdm import tqdm
import pickle
# Pytorch
import torch
from torch.nn import Linear, MSELoss
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [None]:
# Load only the first 13,000 rows of the dataset from Google Drive, then display the frame.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Machine_Learn/dataset_v1.csv',
    nrows=13000,
)
df

Unnamed: 0,SMILES,SPLIT
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,test
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train
...,...,...
12995,CCOC(=O)N1CCC(NC(=O)c2cccs2)CC1,train
12996,CCOC(=O)N1CCC(NC(=O)c2ccc(Cl)cc2)CC1,train
12997,CCOC(=O)N1CCC(NC(=O)c2cc(C)on2)CC1,train
12998,CCOC(=O)N1CCC(NC(=O)c2snnc2C)CC1,train


In [None]:
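# Optional (currently disabled): draw a random 10% sample of the rows and
# renumber the index from 0 so positional and label indexing agree.
# NOTE(review): the output recorded under this cell appears to come from an
# earlier run on a larger frame than the 13,000 rows loaded above; confirm.
# random = df.sample(frac=10/100)
# random.reset_index(drop=True, inplace=True)
# random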

Unnamed: 0,SMILES,SPLIT
0,CCc1noc(CC)c1CNC(=O)c1cccc2cc[nH]c12,train
1,Cc1cccc(OCC(O)Cn2c(C3CC3)nc3ccccc32)c1,test
2,CNC(=O)CCCN(C)C(=O)Nc1ccccc1-n1nc(C)nc1C,train
3,Nc1ccc(C(=O)Nc2ccc(-c3nc4cc(N)ccc4o3)cc2)cc1,train
4,Cc1ccc(OCc2nnc3n2CCCCC3)cc1,train
...,...,...
193691,CCCNC(=O)NCCCN(C)S(C)(=O)=O,test
193692,O=C(NCCCNc1cc(=O)oc2ccccc12)c1ccco1,train
193693,CS(=O)(=O)NC1CCN(C(=O)NCCN2CCOCC2)CC1,train
193694,CCCC(C#N)NC(=O)c1cccc(-c2csc(C)n2)c1,test_scaffolds


In [None]:
# Count the rows that contain a missing value in at least one column.
num_nan_rows = df.isna().any(axis="columns").sum()
print("Количество строк с пропущенными значениями (NaN):", num_nan_rows)

Количество строк с пропущенными значениями (NaN): 0


In [None]:
def get_mol(smiles_or_mol):
    '''
    Turn a SMILES string into an RDKit molecule object.

    Anything that is not a string is handed back unchanged (it is taken to
    be a molecule already). For strings, None is returned when the string is
    empty, when RDKit cannot parse it, or when sanitization raises ValueError.
    '''
    # Non-string input: pass it straight through.
    if not isinstance(smiles_or_mol, str):
        return smiles_or_mol

    # An empty SMILES string can never describe a molecule.
    if not smiles_or_mol:
        return None

    parsed = Chem.MolFromSmiles(smiles_or_mol)
    if parsed is None:
        return None

    try:
        Chem.SanitizeMol(parsed)
    except ValueError:
        return None
    return parsed

def get_np_data(df):
    '''
    Convert the SMILES in a pandas DataFrame into per-split numpy arrays.

    Every molecule is featurized as the bits of its MACCS keys fingerprint
    followed by its LogP value computed with RDKit (MolLogP). The feature row
    is then routed to the bucket named by the row's SPLIT value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide `SMILES` and `SPLIT` columns. Rows whose SPLIT is not
        'train', 'test' or 'test_scaffolds' are ignored.

    Returns
    -------
    tuple of numpy.ndarray
        `(train, test, test_scaf)`, each a 2-D float32 array with one row per
        molecule. All three share the same number of columns, so an empty
        split has shape (0, n_features) rather than (0,).

    Warns
    -----
    UserWarning
        If one or more rows were skipped because `get_mol` returned None for
        their SMILES (unparseable or empty string).
    '''
    import warnings  # local import keeps this definition self-contained

    buckets = {'train': [], 'test': [], 'test_scaffolds': []}
    n_features = 0  # width of a feature row; learned from the first featurized molecule
    skipped = 0

    # Pair SMILES and SPLIT positionally. A label lookup such as df.SPLIT[i]
    # with an enumerate counter breaks (KeyError or misaligned rows) whenever
    # the frame does not carry a default 0..n-1 index, e.g. after sampling.
    for smiles, split in zip(df['SMILES'], df['SPLIT']):
        bucket = buckets.get(split)
        if bucket is None:
            continue  # unknown split label: nothing to collect

        mol = get_mol(smiles)  # create rdkit molecule from SMILES
        if mol is None:
            # Invalid SMILES would otherwise crash inside the RDKit featurizers.
            skipped += 1
            continue

        maccs = GetMACCSKeysFingerprint(mol).ToList()  # fingerprint bits
        logp = MolLogP(mol)  # calculated LogP
        features = maccs + [logp]
        n_features = len(features)
        bucket.append(features)

    if skipped:
        warnings.warn(
            f'get_np_data: skipped {skipped} row(s) whose SMILES could not be parsed',
            stacklevel=2,
        )

    def _as_matrix(rows):
        # The explicit reshape is a no-op for non-empty buckets and keeps
        # empty buckets two-dimensional.
        return np.array(rows, dtype=np.float32).reshape(len(rows), n_features)

    train = _as_matrix(buckets['train'])
    test = _as_matrix(buckets['test'])
    test_scaf = _as_matrix(buckets['test_scaffolds'])

    return train, test, test_scaf


In [None]:
# Featurize every molecule in df and unpack the three per-split arrays.
train, test, test_scaf = get_np_data(df)

In [None]:
# Show the shape of each split's feature matrix.
tuple(split.shape for split in (train, test, test_scaf))

((10743, 168), (1153, 168), (1104, 168))

In [None]:
# Wrap the training feature matrix in a DataFrame and write it to Drive
# without the row index.
train = pd.DataFrame(data=train)
train.to_csv(
    path_or_buf='/content/drive/MyDrive/Machine_Learn/New_train.csv',
    index=False,
)


In [None]:
# Persist the two held-out splits. index=False matches the train export, so
# all three CSVs have the same columns (no extra leading index column that
# would shift the feature layout when the files are read back).
test = pd.DataFrame(test)
test.to_csv('/content/drive/MyDrive/Machine_Learn/New_test.csv', index=False)
test_scaf = pd.DataFrame(test_scaf)
test_scaf.to_csv('/content/drive/MyDrive/Machine_Learn/New_test_scaf.csv', index=False)