### Импорт необходимых библиотек

 Импортируем библиотеки для работы с данными (`pandas`, `numpy`), сериализации (`json`, `joblib`) и предобработки признаков (`VarianceThreshold`, `MinMaxScaler`) из библиотеки `scikit-learn`.


In [21]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
import joblib

### Загрузка обучающих дескрипторов

Загружаем различные типы дескрипторов для обучающей выборки по индексному столбцу `CASRN`:
- ECFP6 в битовом и количественном представлении
- MACCS ключи
- RDKit 2D дескрипторы
- Mordred дескрипторы


In [22]:
# Read every training descriptor table, using the `CASRN` column as the index.
(
    train_ecfp6_bits,
    train_ecfp6_counts,
    train_maccs,
    train_rdkit2d,
    train_mordred,
) = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/descriptors/train_ecfp6_bits.csv',
        '../data/descriptors/train_ecfp6_counts.csv',
        '../data/descriptors/train_maccs.csv',
        '../data/descriptors/train_rdkit2d.csv',
        '../data/descriptors/train_mordred.csv',
    )
)

### Функция для отбора признаков

Определяем функцию `feature_selection`, которая удаляет признаки с нулевой дисперсией и высококоррелированные признаки (порог корреляции по умолчанию — 0.95). Это помогает уменьшить размерность и избежать мультиколлинеарности.


In [23]:
def feature_selection(df, nonzero_thrd=0.0, cor_thrd=0.95):
    """Drop low-variance columns, then columns that duplicate an earlier one.

    Args:
        df: DataFrame whose columns are the candidate features.
        nonzero_thrd: Threshold handed to ``VarianceThreshold``; with the
            default of 0.0 this removes zero-variance columns.
        cor_thrd: Absolute correlation above which a column is considered
            redundant with a column that appears before it.

    Returns:
        A DataFrame holding only the retained columns of ``df``, in their
        original order.
    """
    selector = VarianceThreshold(nonzero_thrd)
    selector.fit(df)
    nonzero_df = df.iloc[:, selector.get_support(indices=True)]

    corr_matrix = nonzero_df.corr().abs()
    # Keep only the entries strictly above the diagonal: each pair is then
    # looked at once and a column's correlation with itself is ignored.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(above_diagonal)

    # A column is redundant when it correlates above the threshold with any
    # column located before it; the earlier column of the pair is kept.
    redundant = (upper > cor_thrd).any(axis=0).to_numpy()
    to_drop = upper.columns[redundant]

    return nonzero_df.drop(columns=to_drop)

### Применение отбора признаков к обучающим данным

Применяем функцию `feature_selection` ко всем обучающим дескрипторам и выводим размеры отфильтрованных наборов признаков.


In [24]:
# Run the same variance/correlation filter over each training descriptor table.
(
    filtered_train_ecfp6_bits,
    filtered_train_ecfp6_counts,
    filtered_train_maccs,
    filtered_train_rdkit2d,
    filtered_train_mordred,
) = map(
    feature_selection,
    (train_ecfp6_bits, train_ecfp6_counts, train_maccs, train_rdkit2d, train_mordred),
)

In [25]:
# Show the shapes of the filtered fingerprint tables.
tuple(
    frame.shape
    for frame in (filtered_train_ecfp6_bits, filtered_train_ecfp6_counts, filtered_train_maccs)
)

((8221, 2048), (8221, 1561), (8221, 145))

In [26]:
# Show the shapes of the filtered Mordred and RDKit2D tables.
tuple(frame.shape for frame in (filtered_train_mordred, filtered_train_rdkit2d))

((8221, 462), (8221, 165))

Создаём словарь `dict_features`, в котором для каждого типа дескрипторов сохраняем список имён признаков, прошедших отбор. Сохраняем этот словарь в JSON-файл для последующего использования при обработке тестовых данных.


In [27]:
# Map each descriptor family to the column names that survived filtering.
dict_features = {
    'ecfp6_bits': list(filtered_train_ecfp6_bits),
    'ecfp6_counts': list(filtered_train_ecfp6_counts),
    'maccs': list(filtered_train_maccs),
    'rdkit2d': list(filtered_train_rdkit2d),
    'mordred': list(filtered_train_mordred),
}

# Keep the parallel name/column lists available under their original names.
desc = list(dict_features)
filtered_features = list(dict_features.values())

### Сохранение отобранных признаков

In [28]:
# Store the selected column names so the test data can be reduced to the same columns.
with open('../data/descriptors/filtered_features.json', 'w') as features_file:
    features_file.write(json.dumps(dict_features))


### Нормализация числовых признаков

Определяем две функции:
- `feature_norm_fit` — выполняет масштабирование признаков обучающей выборки в диапазон [0, 1] и возвращает обученный `MinMaxScaler`. 

- `feature_norm_transform` — масштабирует тестовые признаки с использованием уже обученного скейлера.

Нормализация применяется только к дескрипторам `rdkit2d` и `mordred`.


In [29]:
def feature_norm_fit(train_df, scaler=None):
    """Fit a scaler on ``train_df`` and return the scaled frame with the scaler.

    Args:
        train_df: DataFrame of training features to scale.
        scaler: Object exposing ``fit_transform``. When omitted, a new
            ``MinMaxScaler`` is created for this call, so separate calls
            never share (and refit) one scaler instance.

    Returns:
        A ``(df_norm, scaler)`` tuple: the scaled values as a DataFrame with
        the columns and index of ``train_df``, and the fitted scaler.
    """
    if scaler is None:
        # A default instance in the signature would be built once and then
        # reused by every call, silently overwriting earlier fits.
        scaler = MinMaxScaler()

    scaled = scaler.fit_transform(train_df.values)
    df_norm = pd.DataFrame(scaled, columns=train_df.columns, index=train_df.index)
    return df_norm, scaler

In [30]:
def feature_norm_transform(test_df, scaler):
    """Scale ``test_df`` with an already fitted scaler.

    Args:
        test_df: DataFrame of features to scale.
        scaler: Fitted object exposing ``transform``.

    Returns:
        A DataFrame of the transformed values carrying the columns and index
        of ``test_df``.
    """
    scaled_values = scaler.transform(test_df.values)
    return pd.DataFrame(scaled_values, index=test_df.index, columns=test_df.columns)

### Нормализация и сохранение скейлера для RDKit2D

Применяем нормализацию к отфильтрованным RDKit2D дескрипторам и сохраняем обученный `MinMaxScaler` с помощью `joblib`.


In [31]:
# Pass a dedicated MinMaxScaler so this scaler object is never shared with
# (and refit by) another `feature_norm_fit` call.
norm_train_rdkit2d, scaler_rdkit2d = feature_norm_fit(filtered_train_rdkit2d, MinMaxScaler())

In [32]:
# NOTE(review): `get_params` is referenced without being called, so this cell
# displays the bound method itself rather than the scaler's parameter values.
scaler_rdkit2d.get_params

<bound method BaseEstimator.get_params of MinMaxScaler()>

In [33]:
# Preview the first row of the scaled RDKit2D training features.
norm_train_rdkit2d.head(1)

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.286258,0.112142,0.133237,0.157641,0.018166,0.040446,0.0,0.0,0.086767,0.05297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382204


In [34]:
# Persist the fitted RDKit2D scaler. The directory is spelled `descriptors`
# (lowercase), the spelling used when the training tables are read and when
# this scaler is loaded again for the test data; a differently cased path is
# a different directory on case-sensitive filesystems.
joblib.dump(scaler_rdkit2d, '../data/descriptors/scaler_rdkit2d.pkl')

['../data/Descriptors/scaler_rdkit2d.pkl']

In [35]:
# Reload the RDKit2D scaler for a round-trip check. The directory is spelled
# `descriptors` (lowercase), the spelling used for the notebook's input files,
# so the path also resolves on case-sensitive filesystems.
test_scaler = joblib.load('../data/descriptors/scaler_rdkit2d.pkl')

In [36]:
# Round-trip check: scale the training features again with the reloaded scaler.
# NOTE: despite its name, `test_rdkit2d` holds training data here; the name is
# reassigned when the test CSV is read further down.
test_rdkit2d = feature_norm_transform(test_df=filtered_train_rdkit2d, scaler=test_scaler)

In [37]:
# First row produced by the reloaded scaler; compare with the first row of
# `norm_train_rdkit2d` to confirm the saved scaler reproduces the same values.
test_rdkit2d.head(1) 

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.286258,0.112142,0.133237,0.157641,0.018166,0.040446,0.0,0.0,0.086767,0.05297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382204


In [38]:
# Fit the Mordred features on their own MinMaxScaler so this scaler object is
# never shared with (and refit by) another `feature_norm_fit` call, then
# persist it.
norm_train_mordred, scaler_mordred = feature_norm_fit(filtered_train_mordred, MinMaxScaler())
joblib.dump(scaler_mordred, '../data/descriptors/scaler_mordred.pkl')

['../data/descriptors/scaler_mordred.pkl']

In [39]:
# Reload the Mordred scaler that was just written, for a round-trip check.
test_scaler_mordred = joblib.load('../data/descriptors/scaler_mordred.pkl')

In [40]:
# Round-trip check: scale the training features again with the reloaded scaler
# and preview the first row.
# NOTE: despite its name, `test_mordred` holds training data here; the name is
# reassigned when the test CSV is read further down.
test_mordred = feature_norm_transform(test_df=filtered_train_mordred, scaler=test_scaler_mordred)
test_mordred.head(1)

Unnamed: 0_level_0,nAcid,nBase,SpAbs_A,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.0,0.0,0.13119,0.749405,0.859231,0.622256,0.397997,1.990715e-08,0.25,0.096346,...,0.363075,0.241979,0.338535,0.456364,0.471193,0.540906,0.0,0.0,0.244308,0.004741


In [41]:
# First row of the originally scaled Mordred training features, shown for
# comparison with the row produced by the reloaded scaler.
norm_train_mordred.head(1)

Unnamed: 0_level_0,nAcid,nBase,SpAbs_A,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.0,0.0,0.13119,0.749405,0.859231,0.622256,0.397997,1.990715e-08,0.25,0.096346,...,0.363075,0.241979,0.338535,0.456364,0.471193,0.540906,0.0,0.0,0.244308,0.004741


In [43]:
# Write the modelling-ready training tables: fingerprints as filtered,
# RDKit2D and Mordred in their scaled form.
for _frame, _csv_path in (
    (filtered_train_ecfp6_bits, '../data/Bmodel_features/modeling_train_ecfp6_bits.csv'),
    (filtered_train_ecfp6_counts, '../data/Bmodel_features/modeling_train_ecfp6_counts.csv'),
    (filtered_train_maccs, '../data/Bmodel_features/modeling_train_maccs.csv'),
    (norm_train_rdkit2d, '../data/Bmodel_features/modeling_train_rdkit2d.csv'),
    (norm_train_mordred, '../data/Bmodel_features/modeling_train_mordred.csv'),
):
    _frame.to_csv(_csv_path)

Now let's handle the test features.

In [46]:
def test_feature(df, feature, scaler=None,
                 features_path='../data/descriptors/filtered_features.json'):
    """Transform raw (computed) test features into the format ready for modeling.

    The columns of ``df`` are reduced to the ones stored for ``feature`` in
    the JSON file at ``features_path`` and, when a scaler is given, scaled
    with it.

    Args:
        df: DataFrame of test features; it must contain every column stored
            for ``feature``.
        feature: Name of the feature set, one of ['ecfp6_bits',
            'ecfp6_counts', 'maccs', 'rdkit2d', 'mordred'].
        scaler: Optional fitted scaler exposing ``transform``; used for
            rdkit2d and mordred.
        features_path: JSON file mapping each feature-set name to its list of
            selected column names. The default is the lowercase-`descriptors`
            path that the selected-feature file is written to.

    Returns:
        A DataFrame with the selected columns, scaled when ``scaler`` is
        provided.

    Raises:
        ValueError: If ``feature`` is not a key of the JSON file.
    """
    with open(features_path) as f:
        dict_features = json.load(f)

    if feature not in dict_features:
        raise ValueError(f'The feature **{feature}** is not support, please choose from [ecfp6_bits, ecfp6_counts, maccs, rdkit2d, mordred]')

    df = df[dict_features[feature]]

    # Compare against None explicitly: whether a scaler was supplied must not
    # depend on the truthiness of the scaler object.
    if scaler is not None:
        df = feature_norm_transform(df, scaler)

    return df

In [47]:
# Import all the test data. The directory is spelled `descriptors` (lowercase),
# the same spelling used when the training tables are read, so the paths also
# resolve on case-sensitive filesystems.
test_ecfp6_bits = pd.read_csv('../data/descriptors/test_ecfp6_bits.csv', index_col='CASRN')
test_ecfp6_counts = pd.read_csv('../data/descriptors/test_ecfp6_counts.csv', index_col='CASRN')
test_maccs = pd.read_csv('../data/descriptors/test_maccs.csv', index_col='CASRN')
test_rdkit2d = pd.read_csv('../data/descriptors/test_rdkit2d.csv', index_col='CASRN')
test_mordred = pd.read_csv('../data/descriptors/test_mordred.csv', index_col='CASRN')

In [48]:
# Reduce the fingerprint test tables to the columns selected on the training
# set; no scaler is passed for these.
(
    filtered_test_ecfp6_bits,
    filtered_test_ecfp6_counts,
    filtered_test_maccs,
) = (
    test_feature(raw_frame, descriptor_name)
    for raw_frame, descriptor_name in (
        (test_ecfp6_bits, 'ecfp6_bits'),
        (test_ecfp6_counts, 'ecfp6_counts'),
        (test_maccs, 'maccs'),
    )
)

In [49]:
# Show the shapes of the filtered fingerprint test tables.
tuple(
    frame.shape
    for frame in (filtered_test_ecfp6_bits, filtered_test_ecfp6_counts, filtered_test_maccs)
)

((2849, 2048), (2849, 1561), (2849, 145))

In [50]:
# Load the scalers fitted on the training data, Mordred first, then RDKit2D.
scaler_mordred, scaler_rdkit2d = (
    joblib.load(pkl_path)
    for pkl_path in (
        '../data/descriptors/scaler_mordred.pkl',
        '../data/descriptors/scaler_rdkit2d.pkl',
    )
)

In [51]:
# Select the training-set columns of the RDKit2D test table and scale them
# with the scaler fitted on the training data.
filtered_test_rdkit2d = test_feature(df=test_rdkit2d, feature='rdkit2d', scaler=scaler_rdkit2d)

In [52]:
# Preview the first row of the selected and scaled RDKit2D test features.
filtered_test_rdkit2d.head(1)

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130209-82-4,0.207313,0.100235,0.185335,0.22792,0.055726,0.16968,0.0,0.195699,0.129578,0.100139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.254728


In [53]:
# Select the training-set columns of the Mordred test table and scale them
# with the scaler fitted on the training data.
filtered_test_mordred = test_feature(df=test_mordred, feature='mordred', scaler=scaler_mordred)

In [54]:
# Preview the first row of the selected and scaled Mordred test features.
filtered_test_mordred.head(1)


Unnamed: 0_level_0,nAcid,nBase,SpAbs_A,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130209-82-4,0.0,0.0,0.186544,0.750389,0.867067,0.682724,0.335865,2.993004e-07,0.125,0.225914,...,0.252108,0.1689,0.266486,0.273758,0.249418,0.412305,0.0,0.437189,0.351957,0.015544


In [55]:
# Show the shapes of the processed Mordred and RDKit2D test tables.
tuple(frame.shape for frame in (filtered_test_mordred, filtered_test_rdkit2d))

((2849, 462), (2849, 165))

In [56]:
# Write the modelling-ready test tables next to the training ones.
for _frame, _csv_path in (
    (filtered_test_ecfp6_bits, '../data/Bmodel_features/modeling_test_ecfp6_bits.csv'),
    (filtered_test_ecfp6_counts, '../data/Bmodel_features/modeling_test_ecfp6_counts.csv'),
    (filtered_test_maccs, '../data/Bmodel_features/modeling_test_maccs.csv'),
    (filtered_test_rdkit2d, '../data/Bmodel_features/modeling_test_rdkit2d.csv'),
    (filtered_test_mordred, '../data/Bmodel_features/modeling_test_mordred.csv'),
):
    _frame.to_csv(_csv_path)