1. For your dataset, compute 1000+ descriptors from at least two sources (RDKit, PubChem, Mordred, etc.).

In [1]:
pip install rdkit



In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [3]:
# Load the dataset (presumably a QM9 subset, judging by the file name) from the
# working directory into the main DataFrame used by the rest of the notebook.
DATA_PATH = "qm9_new.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Inspect the schema right after loading: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  20000 non-null  object 
 1   smiles  20000 non-null  object 
 2   A       20000 non-null  float64
 3   B       20000 non-null  float64
 4   C       20000 non-null  float64
 5   mu      20000 non-null  float64
 6   alpha   20000 non-null  float64
 7   homo    20000 non-null  float64
 8   lumo    20000 non-null  float64
 9   gap     20000 non-null  float64
 10  r2      20000 non-null  float64
 11  zpve    20000 non-null  float64
 12  u0      20000 non-null  float64
 13  u298    20000 non-null  float64
 14  h298    20000 non-null  float64
 15  g298    20000 non-null  float64
 16  cv      20000 non-null  float64
dtypes: float64(15), object(2)
memory usage: 2.6+ MB


In [5]:
# Extract chemical descriptors with RDKit
def calculate_descriptors(smiles):
    """Compute RDKit descriptors for a single molecule given as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        Dictionary with the keys ``'MolecularWeight'`` (``Descriptors.MolWt``)
        and ``'NumAtoms'`` (``Descriptors.HeavyAtomCount``, i.e. heavy atoms
        only), or ``None`` when the input is not a string or RDKit cannot
        parse it.
    """
    # Missing CSV cells arrive as float NaN (or None); RDKit raises on
    # non-string input, so treat those the same way as unparseable SMILES.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an invalid SMILES by returning None instead of raising.
        return None

    return {
        'MolecularWeight': Descriptors.MolWt(mol),
        'NumAtoms': Descriptors.HeavyAtomCount(mol),
        # Other RDKit descriptors can be added here
    }



# The function is applied to the 'smiles' column to extract the descriptors in a later cell.

In [7]:
# List the column names available before any descriptor columns are added.
print(df.columns)

Index(['mol_id', 'smiles', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap',
       'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv'],
      dtype='object')


In [8]:
# Re-check the schema before adding descriptors. No cell has modified df since
# the first df.info() call, so this reports the same 17 columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  20000 non-null  object 
 1   smiles  20000 non-null  object 
 2   A       20000 non-null  float64
 3   B       20000 non-null  float64
 4   C       20000 non-null  float64
 5   mu      20000 non-null  float64
 6   alpha   20000 non-null  float64
 7   homo    20000 non-null  float64
 8   lumo    20000 non-null  float64
 9   gap     20000 non-null  float64
 10  r2      20000 non-null  float64
 11  zpve    20000 non-null  float64
 12  u0      20000 non-null  float64
 13  u298    20000 non-null  float64
 14  h298    20000 non-null  float64
 15  g298    20000 non-null  float64
 16  cv      20000 non-null  float64
dtypes: float64(15), object(2)
memory usage: 2.6+ MB


In [9]:
# Compute one descriptor dict per molecule (None where calculate_descriptors
# rejects the SMILES) and keep it in a temporary 'Descriptors' column.
df['Descriptors'] = df['smiles'].map(calculate_descriptors)

In [10]:
# Verify that the 'Descriptors' column was added; its non-null count shows how
# many SMILES strings were converted to descriptor dicts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mol_id       20000 non-null  object 
 1   smiles       20000 non-null  object 
 2   A            20000 non-null  float64
 3   B            20000 non-null  float64
 4   C            20000 non-null  float64
 5   mu           20000 non-null  float64
 6   alpha        20000 non-null  float64
 7   homo         20000 non-null  float64
 8   lumo         20000 non-null  float64
 9   gap          20000 non-null  float64
 10  r2           20000 non-null  float64
 11  zpve         20000 non-null  float64
 12  u0           20000 non-null  float64
 13  u298         20000 non-null  float64
 14  h298         20000 non-null  float64
 15  g298         20000 non-null  float64
 16  cv           20000 non-null  float64
 17  Descriptors  20000 non-null  object 
dtypes: float64(15), object(3)
memory usage: 2.7+ MB

In [11]:
# Keep only the rows for which descriptors were computed (drop the None entries).
has_descriptors = df['Descriptors'].notna()
df = df[has_descriptors]

In [12]:
# Expand the descriptor dicts into separate columns.
# The frame is built once from the list of dicts instead of calling
# `.apply(pd.Series)`, which constructs one Series per row and is very slow on
# tens of thousands of rows. The original index is reused so that the
# column-wise concat stays row-aligned. Integer-valued descriptors keep their
# integer dtype instead of being upcast to float.
descriptor_columns = pd.DataFrame(df['Descriptors'].tolist(), index=df.index)
df = pd.concat([df.drop(columns=['Descriptors']), descriptor_columns], axis=1)

In [14]:
# Define the target variable and the feature matrix.
# NOTE(review): only the target and the two identifier columns are removed, so
# the remaining dataset property columns (A, B, C, mu, ..., g298) stay in X
# next to the RDKit descriptors - confirm that this is intended.
y = df['cv']  # target variable
X = df.drop(columns=['cv', 'smiles', 'mol_id'])  # feature matrix

In [15]:
# Feature selection with SelectKBest and mutual_info_regression.
# mutual_info_regression adds small random noise to continuous features, so
# without a fixed seed the scores (and possibly the selected set) can differ
# between runs. The seed is pinned to make the selection reproducible.
SEED = 42
K_BEST = 5  # number of features to keep; adjust as needed


def seeded_mutual_info(features, target):
    """Score features by mutual information with the target using a fixed seed."""
    return mutual_info_regression(features, target, random_state=SEED)


selector = SelectKBest(seeded_mutual_info, k=K_BEST)
X_selected = selector.fit_transform(X, y)
# Boolean support mask -> names of the columns that were kept.
selected_features = X.columns[selector.get_support()]

In [16]:
# Report the names of the features kept by the selector.
print(f"Selected features: {selected_features!s}")

Selected features: Index(['zpve', 'u0', 'u298', 'h298', 'g298'], dtype='object')
