# **Загрузка датасета**

In [None]:
!pip install rdkit

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, GraphDescriptors, rdMolDescriptors

In [None]:
# Load the DUD-E docking-score table and display it as the cell output.
data_1 = pd.read_csv(
    "data/Maragakis et al DUDE docking scores and vortex properties.csv"
)
data_1

In [None]:
# Keep only the rows that belong to the "aldr" target.
data = data_1.loc[data_1["target"] == "aldr"]

In [None]:
# Drop the columns considered uninformative; drop() returns a new frame.
columns_to_drop = ["ID", "MW", "LIPINSKI_COUNT", "RO3_COUNT"]
data = data.drop(columns=columns_to_drop)

# **Расчет физико-химических дескрипторов**

In [None]:
def add_physcem_descriptors(data, smiles_col="SMILES"):
    """Append RDKit physicochemical descriptors computed from SMILES strings.

    Args:
        data: DataFrame with one molecule per row.
        smiles_col: Name of the column that holds the SMILES strings.

    Returns:
        A new DataFrame made of ``data`` followed by the columns
        ``mol_refractivity`` (``Descriptors.MolMR``) and ``formal_charge``
        (``Chem.GetFormalCharge``). The index of ``data`` is kept. Rows whose
        SMILES is not a string or cannot be parsed get NaN in both columns.
    """
    columns = ["mol_refractivity", "formal_charge"]

    def calc(mol):
        # MolFromSmiles yields None for unparsable input; emit NaN instead of
        # letting the descriptor functions fail on a None molecule.
        if mol is None:
            return dict.fromkeys(columns, float("nan"))
        return {
            "mol_refractivity": Descriptors.MolMR(mol),
            "formal_charge": Chem.GetFormalCharge(mol),
        }

    # Missing values (None / NaN) are not strings and are never sent to RDKit.
    mols = [
        Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        for smi in data[smiles_col]
    ]
    # Passing `columns` keeps the output schema fixed even for empty input.
    descriptors = pd.DataFrame(
        [calc(mol) for mol in mols], index=data.index, columns=columns
    )
    return pd.concat([data, descriptors], axis=1)

In [None]:
# Extend the working table with the physicochemical descriptor columns.
data = add_physcem_descriptors(data=data, smiles_col="SMILES")

# **Расчет топологических дескрипторов**

In [None]:
def add_topology_descriptors(data, smiles_col="SMILES"):
    """Append RDKit topological descriptors computed from SMILES strings.

    Args:
        data: DataFrame with one molecule per row.
        smiles_col: Name of the column that holds the SMILES strings.

    Returns:
        A new DataFrame made of ``data`` followed by the columns
        ``balaban_j``, ``bertz_ct``, ``kappa1``..``kappa3``,
        ``chi0v``..``chi3v`` and ``labute_asa``. The index of ``data`` is
        kept. Rows whose SMILES is not a string or cannot be parsed get NaN
        in every added column.
    """
    columns = [
        "balaban_j",
        "bertz_ct",
        "kappa1",
        "kappa2",
        "kappa3",
        "chi0v",
        "chi1v",
        "chi2v",
        "chi3v",
        "labute_asa",
    ]

    def calc(mol):
        # MolFromSmiles yields None for unparsable input; emit NaN instead of
        # letting the descriptor functions fail on a None molecule.
        if mol is None:
            return dict.fromkeys(columns, float("nan"))
        return {
            "balaban_j": GraphDescriptors.BalabanJ(mol),
            "bertz_ct": Descriptors.BertzCT(mol),
            "kappa1": rdMolDescriptors.CalcKappa1(mol),
            "kappa2": rdMolDescriptors.CalcKappa2(mol),
            "kappa3": rdMolDescriptors.CalcKappa3(mol),
            "chi0v": rdMolDescriptors.CalcChi0v(mol),
            "chi1v": rdMolDescriptors.CalcChi1v(mol),
            "chi2v": rdMolDescriptors.CalcChi2v(mol),
            "chi3v": rdMolDescriptors.CalcChi3v(mol),
            "labute_asa": rdMolDescriptors.CalcLabuteASA(mol),
        }

    # Missing values (None / NaN) are not strings and are never sent to RDKit.
    mols = [
        Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        for smi in data[smiles_col]
    ]
    # Passing `columns` keeps the output schema fixed even for empty input.
    descriptors = pd.DataFrame(
        [calc(mol) for mol in mols], index=data.index, columns=columns
    )
    return pd.concat([data, descriptors], axis=1)

In [None]:
# Extend the working table with the topological descriptor columns.
data = add_topology_descriptors(data=data, smiles_col="SMILES")

# **Расчет площади поверхности по диапазонам гидрофобности, площади поверхности по диапазонам молекулярной рефрактивности, площади поверхности по диапазонам частичных зарядов**

In [None]:
from rdkit.Chem import Descriptors

def add_surface(data: pd.DataFrame, smiles_col: str = "SMILES") -> pd.DataFrame:
    """Append the RDKit VSA descriptor families to a table of molecules.

    Every entry of ``Descriptors.descList`` whose name starts with
    ``SlogP_VSA``, ``SMR_VSA`` or ``PEOE_VSA`` is evaluated for each SMILES
    found in ``smiles_col``. A SMILES that RDKit cannot parse contributes an
    empty record, so it carries no descriptor values in the result.

    The output is aligned by position: the index of ``data`` is discarded
    and replaced by a fresh 0..n-1 range.
    """
    prefixes = ("SlogP_VSA", "SMR_VSA", "PEOE_VSA")
    # descList holds (name, function) pairs; keep those from the wanted families.
    selected = [
        entry for entry in Descriptors.descList if entry[0].startswith(prefixes)
    ]

    def describe(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return {}
        return {name: fn(mol) for name, fn in selected}

    records = [describe(smiles) for smiles in data[smiles_col]]
    surface = pd.DataFrame(records)
    return pd.concat([data.reset_index(drop=True), surface], axis=1)

In [None]:
# Extend the working table with the SlogP_VSA / SMR_VSA / PEOE_VSA columns.
data = add_surface(data=data, smiles_col="SMILES")

## **Расчет количества функциональных групп**

In [None]:
def add_functional_descriptors(data, smiles_col="SMILES"):
    """Append structural-feature counts computed by RDKit from SMILES strings.

    Added columns (names are kept unchanged for downstream consumers):
        NumAmideBonds   -- ``CalcNumAmideBonds``
        NumAlifCarboxy  -- ``CalcNumAliphaticCarbocycles``; despite the name
                           this counts aliphatic carbocycles
        NumHeteroCycles -- ``CalcNumAromaticHeterocycles`` (aromatic only)
        NumSpiroAtoms   -- ``CalcNumSpiroAtoms``

    Args:
        data: DataFrame with one molecule per row.
        smiles_col: Name of the column that holds the SMILES strings.

    Returns:
        A new DataFrame made of ``data`` (index reset to 0..n-1) followed by
        the four count columns. Rows whose SMILES is not a string or cannot
        be parsed get NaN in every added column.
    """
    columns = ["NumAmideBonds", "NumAlifCarboxy", "NumHeteroCycles", "NumSpiroAtoms"]

    def calc(mol):
        # MolFromSmiles yields None for unparsable input; emit NaN instead of
        # letting the descriptor functions fail on a None molecule.
        if mol is None:
            return dict.fromkeys(columns, float("nan"))
        return {
            "NumAmideBonds": rdMolDescriptors.CalcNumAmideBonds(mol),
            "NumAlifCarboxy": rdMolDescriptors.CalcNumAliphaticCarbocycles(mol),
            "NumHeteroCycles": rdMolDescriptors.CalcNumAromaticHeterocycles(mol),
            "NumSpiroAtoms": rdMolDescriptors.CalcNumSpiroAtoms(mol),
        }

    # Missing values (None / NaN) are not strings and are never sent to RDKit.
    mols = [
        Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        for smi in data[smiles_col]
    ]
    # Passing `columns` keeps the output schema fixed even for empty input.
    counts = pd.DataFrame([calc(mol) for mol in mols], columns=columns)
    return pd.concat([data.reset_index(drop=True), counts], axis=1)

In [None]:
# Extend the working table with the structural-feature count columns.
data = add_functional_descriptors(data=data, smiles_col="SMILES")

# **Сохранение итогового датасета фичей без фингерпринтов**

In [None]:
# Write the descriptor table (no fingerprints) to CSV first, so the file is
# produced even when the Colab-only download helper is unavailable.
data.to_csv("result_features_no_fingerprint.csv", index=False)

# google.colab exists only inside Google Colab; elsewhere the CSV simply stays
# in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("result_features_no_fingerprint.csv")

# **Вычисление фингерпринтов MACCS**

In [None]:
# Load the DUD-E docking-score table again for the MACCS section and display it.
data_1 = pd.read_csv(
    "data/Maragakis et al DUDE docking scores and vortex properties.csv"
)
data_1

In [None]:
# Keep only the structure and the docking score (rows of every target).
data = data_1.loc[:, ["SMILES", "score"]]

In [None]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys

def add_maccs_fingerprints(data, smiles_col="SMILES"):
    """Append MACCS key fingerprints computed from SMILES strings.

    Args:
        data: DataFrame with one molecule per row.
        smiles_col: Name of the column that holds the SMILES strings.

    Returns:
        A new DataFrame made of ``data`` followed by one column per bit,
        ``maccs_0`` .. ``maccs_166``. The index of ``data`` is kept. Rows
        whose SMILES is not a string or cannot be parsed get NaN in every
        bit column (which makes those columns float instead of int).
    """
    # RDKit's GenMACCSKeys produces a 167-bit vector.
    n_bits = 167
    columns = [f"maccs_{i}" for i in range(n_bits)]

    def calc_fp(smiles):
        # Missing values (None / NaN) are not strings and never reach RDKit;
        # MolFromSmiles itself yields None for unparsable strings.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return [float("nan")] * n_bits
        return [int(bit) for bit in MACCSkeys.GenMACCSKeys(mol).ToBitString()]

    # Passing `columns` keeps the output schema fixed even for empty input.
    fps = pd.DataFrame(
        [calc_fp(smi) for smi in data[smiles_col]],
        index=data.index,
        columns=columns,
    )
    return pd.concat([data, fps], axis=1)

In [None]:
# Extend the working table with the MACCS bit columns.
data = add_maccs_fingerprints(data=data, smiles_col="SMILES")

In [None]:
# The raw SMILES string is no longer needed once the bits are computed.
data = data.drop(columns="SMILES")

# **Вычисление фингерпринтов Morgan**

In [None]:
# Load the DUD-E docking-score table again for the Morgan section and display it.
data_1 = pd.read_csv(
    "data/Maragakis et al DUDE docking scores and vortex properties.csv"
)
data_1

In [None]:
# Keep only the structure and the docking score (rows of every target).
data = data_1.loc[:, ["SMILES", "score"]]

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

def add_morgan_fingerprints(data, smiles_col="SMILES", radius=2, n_bits=1024):
    """Append Morgan bit-vector fingerprints computed from SMILES strings.

    Args:
        data: DataFrame with one molecule per row.
        smiles_col: Name of the column that holds the SMILES strings.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector; also the number of added columns.

    Returns:
        A new DataFrame made of ``data`` followed by the columns
        ``morgan_0`` .. ``morgan_{n_bits - 1}``. The index of ``data`` is
        kept. Rows whose SMILES is not a string or cannot be parsed get NaN
        in every bit column (which makes those columns float instead of int).
    """
    columns = [f"morgan_{i}" for i in range(n_bits)]

    def calc_fp(smiles):
        # Missing values (None / NaN) are not strings and never reach RDKit;
        # MolFromSmiles itself yields None for unparsable strings.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return [float("nan")] * n_bits
        # NOTE(review): recent RDKit releases reportedly deprecate this call in
        # favour of rdFingerprintGenerator.GetMorganGenerator -- confirm against
        # the installed version before migrating.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return [int(bit) for bit in fp.ToBitString()]

    # Passing `columns` keeps the output schema fixed even for empty input.
    fps_df = pd.DataFrame(
        [calc_fp(smi) for smi in data[smiles_col]],
        index=data.index,
        columns=columns,
    )
    return pd.concat([data, fps_df], axis=1)

In [None]:
# Extend the working table with the Morgan bit columns (default radius and size).
data = add_morgan_fingerprints(data=data, smiles_col="SMILES")

In [None]:
# The raw SMILES string is no longer needed once the bits are computed.
data = data.drop(columns="SMILES")