In [None]:
!pip install numpy<2
!pip install rdkit-pypi -q


/bin/bash: line 1: 2: No such file or directory
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from google.colab import drive

# Mount Google Drive so the CSV files stored under MyDrive are readable here.
drive.mount('/content/drive')

# Training table: supplies the SMILES strings the RDKit features are computed from.
train_df_path = '/content/drive/MyDrive/dacon_1/train.csv'
train_df = pd.read_csv(train_df_path)

# Candidate table: the computed features are later left-joined onto it by ID.
candidates_df_path = '/content/drive/MyDrive/dacon_1/input_candidates_01_1.csv'
candidates_df = pd.read_csv(candidates_df_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def smiles_to_mol(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Args:
        smiles: The SMILES text to parse. Any non-string value (for example
            ``None`` or a float NaN standing in for a missing cell) is
            treated as unparseable.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for the string, or ``None``
        when the input is not a string or the parser raises.
    """
    # Reject non-string input up front instead of relying on the parser to
    # raise for it.
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort parsing: one bad row must not abort the whole column.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return None

# Parse every SMILES string once; rows that cannot be parsed hold None.
train_df['Mol'] = train_df['Canonical_Smiles'].map(smiles_to_mol)

# Scalar descriptors, listed in the order they appear in the result dict.
_DESCRIPTOR_FEATURES = (
    'ExactMolWt',
    'HeavyAtomCount',
    'NumAtoms',
    'NumValenceElectrons',
    'MolMR',
    'MaxPartialCharge',
    'MinPartialCharge',
    'FractionCSP3',
    'RingCount',
    'NumAromaticRings',
    'NumAliphaticRings',
    'NumSaturatedRings',
    'NumUnsaturatedRings',
)

# Element symbols whose atoms are counted into 'Num_<symbol>' features.
_COUNTED_ELEMENTS = ('C', 'O', 'N', 'S', 'P', 'F', 'Cl', 'Br', 'I')

# SMARTS queries whose match counts become 'Num_<name>' features.
_SUBSTRUCTURE_SMARTS = {
    'Amide': 'C(=O)N',
    'Sulfonamide': 'S(=O)(=O)N',
    'Alcohol': '[OX2H]',
    'Amine': '[NX3;H2,H1;!$(NC=O)]',
    'CarboxylicAcid': 'C(=O)[OH]',
    'Ester': 'C(=O)O[#6]',
    'Ketone': 'O=[C]([!$([C]=O)])[!$([C]=O)]',
}

# Every key of the result dict, in output order; single source of truth for
# both the computed and the all-None result.
_RDKIT_FEATURE_NAMES = (
    _DESCRIPTOR_FEATURES
    + tuple(f'Num_{element}' for element in _COUNTED_ELEMENTS)
    + tuple(f'Num_{group}' for group in _SUBSTRUCTURE_SMARTS)
)

# Filled on first use so the SMARTS strings are compiled once, not per molecule.
_compiled_substructures = {}


def _get_substructure_queries():
    """Return the compiled SMARTS queries keyed by name, compiling on first use."""
    if not _compiled_substructures:
        for name, smarts in _SUBSTRUCTURE_SMARTS.items():
            _compiled_substructures[name] = Chem.MolFromSmarts(smarts)
    return _compiled_substructures


def calculate_rdkit_features(mol):
    """Compute a flat dict of RDKit-derived features for one molecule.

    Args:
        mol: An RDKit molecule, or ``None`` for a molecule that could not be
            parsed. Gasteiger charges are computed on this object, so it
            gains per-atom ``_GasteigerCharge`` properties as a side effect.

    Returns:
        A dict with the same keys in the same order for every input:
        the scalar descriptors, then ``Num_<element>`` atom counts, then
        ``Num_<group>`` SMARTS match counts. All values are ``None`` when
        ``mol`` is ``None``. ``MaxPartialCharge``/``MinPartialCharge`` are
        ``None`` when charge computation fails or yields only NaN values.
    """
    # Function-scope imports: the notebook's import cell provides neither name.
    import math
    from collections import Counter

    if mol is None:
        return dict.fromkeys(_RDKIT_FEATURE_NAMES)

    features = {
        'ExactMolWt': Descriptors.ExactMolWt(mol),
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'NumAtoms': mol.GetNumAtoms(),
        'NumValenceElectrons': Descriptors.NumValenceElectrons(mol),
        'MolMR': Descriptors.MolMR(mol),
    }

    # Partial charges are best-effort: a failure here leaves the two charge
    # features empty rather than discarding the whole row.
    try:
        AllChem.ComputeGasteigerCharges(mol)
        charges = [atom.GetDoubleProp('_GasteigerCharge') for atom in mol.GetAtoms()]
    except Exception:
        charges = []
    # math.isnan replaces the previous np.isnan: numpy was never imported, so
    # that call raised NameError inside a bare except and both charge
    # features silently came out as None for every molecule.
    charges = [charge for charge in charges if not math.isnan(charge)]
    features['MaxPartialCharge'] = max(charges) if charges else None
    features['MinPartialCharge'] = min(charges) if charges else None

    features['FractionCSP3'] = Descriptors.FractionCSP3(mol)

    features['RingCount'] = Descriptors.RingCount(mol)
    features['NumAromaticRings'] = rdMolDescriptors.CalcNumAromaticRings(mol)
    features['NumAliphaticRings'] = rdMolDescriptors.CalcNumAliphaticRings(mol)
    features['NumSaturatedRings'] = rdMolDescriptors.CalcNumSaturatedRings(mol)
    # Derived as "every ring that is not counted as saturated".
    features['NumUnsaturatedRings'] = features['RingCount'] - features['NumSaturatedRings']

    # One pass over the atoms; a Counter yields 0 for elements that are absent.
    symbol_counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
    for element in _COUNTED_ELEMENTS:
        features[f'Num_{element}'] = symbol_counts[element]

    for name, query in _get_substructure_queries().items():
        # A query is None only if its SMARTS string failed to compile.
        features[f'Num_{name}'] = (
            len(mol.GetSubstructMatches(query)) if query is not None else None
        )

    return features

# One feature dict per molecule, expanded so that each dict key becomes a column.
rdkit_features_df = (
    train_df['Mol']
    .apply(calculate_rdkit_features)
    .apply(pd.Series)
)

# The Mol objects were only needed to compute the features above.
train_df = train_df.drop(columns=['Mol'])

# Put the original training columns and the new feature columns side by side.
train_df_with_rdkit = pd.concat([train_df, rdkit_features_df], axis=1)

# Keep only what should be joined onto the candidates: index by ID and leave
# out the SMILES text and the Inhibition column.
rdkit_features_for_merge = (
    train_df_with_rdkit
    .drop(columns=['Canonical_Smiles', 'Inhibition'])
    .set_index('ID')
)

# Left join keeps every candidate row, including IDs that have no features.
candidates_01_1_with_rdkit_features = candidates_df.merge(
    rdkit_features_for_merge,
    how='left',
    on='ID',
)

output_candidates_path = '/content/drive/MyDrive/dacon_1/candidates_03_1.csv'
candidates_01_1_with_rdkit_features.to_csv(output_candidates_path, index=False)