In [12]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn import set_config
set_config(transform_output = "pandas")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Data loading

### Keny

In [13]:
from src.features import create_features

In [5]:
descriptors = pd.read_csv("molecules_descriptors_mordred.csv").iloc[:,2:]
X_orig =descriptors.iloc[:,:-1]
Y_orig = descriptors.iloc[:,-1]

In [8]:
descriptors_and_fp = pd.read_csv("data/descriptor_fp_features.csv").iloc[:,2:]
X_orig =descriptors.iloc[:,:-1]
Y_orig = descriptors.iloc[:,-1]

### Max

In [14]:
from sklearn.metrics import r2_score, mean_squared_error, pairwise_distances
all_fp_data = pd.read_csv("fp_dataframe.csv")
all_data = pd.read_csv("molecules_descriptors_mordred.csv")

fp_data =  all_fp_data.drop(all_fp_data.columns[[0, 1, -1]], axis=1)

pw_dist = pairwise_distances(fp_data, n_jobs = -1)
avg_pw_dist = np.mean(pw_dist, axis=0).reshape(-1, 1)

all_data.insert(3, "avg_pw_dist", avg_pw_dist)
X_orig =all_data.iloc[:,2:-1]
Y_orig = all_data.iloc[:,-1]


### Final Data preprocessing

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_orig, Y_orig,
                                                    test_size=0.4, random_state=2)

## Pipeline construction

In [18]:
from sklearn.gaussian_process import GaussianProcessRegressor,kernels
rbf_kernel = kernels.RBF(1.0, length_scale_bounds=(1e-3, 1e5))*1.0
noise_kernel = 1* kernels.WhiteKernel(noise_level=1, noise_level_bounds=(1e-1, 1e2))
periodic_kernel = kernels.ExpSineSquared(length_scale=1, periodicity=1)
full_kernel = rbf_kernel+noise_kernel*periodic_kernel

In [76]:
from sklearn.model_selection import GridSearchCV
k_best_preselection = [200,500,"all"]
sfs_features = [1,10,100,300]
param_grid = [
    {
        "k_best__k":k_best_preselection,
        "sfs__n_features_to_select":sfs_features,
    }
]

In [77]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold,SelectKBest,f_regression,SequentialFeatureSelector
from sklearn import linear_model

clf1 = linear_model.PoissonRegressor(max_iter=10000)
estimators = [('standard_scaler',StandardScaler()),
                ('k_best', SelectKBest(f_regression)), 
                ('sfs',SequentialFeatureSelector(clf1, direction="forward", cv = 3, n_jobs=-1,scoring="r2")),
                ("gaussian_process_regressor",GaussianProcessRegressor(kernel=full_kernel, random_state=1, alpha=0))               
              ]
full_model = Pipeline(estimators,memory="cache/")
grid = GridSearchCV(full_model,scoring="r2", n_jobs=-1, param_grid=param_grid,cv=2,verbose=2)

In [78]:
grid.fit(X_train, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


2 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\keny\Nextcloud2\Uni\Chinf\group_project\chin_group_project\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\keny\Nextcloud2\Uni\Chinf\group_project\chin_group_project\.venv\lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\keny\Nextcloud2\Uni\Chinf\group_project\chin_group_project\.venv\lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)


In [58]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_k_best__k,param_sfs__n_features_to_select,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,7.897408,0.078082,0.025374,0.004882,200,1,"{'k_best__k': 200, 'sfs__n_features_to_select'...",0.478811,0.640881,0.559846,0.081035,6
1,86.427924,0.045872,0.026353,0.001952,200,10,"{'k_best__k': 200, 'sfs__n_features_to_select'...",0.569949,0.748629,0.659289,0.08934,1
2,693.941849,11.511228,0.019519,0.003903,200,100,"{'k_best__k': 200, 'sfs__n_features_to_select'...",0.390465,0.739186,0.564825,0.17436,5
3,19.216612,0.045263,0.02623,0.002073,500,1,"{'k_best__k': 500, 'sfs__n_features_to_select'...",0.478811,0.640881,0.559846,0.081035,6
4,0.37088,0.012688,0.030255,0.000977,500,10,"{'k_best__k': 500, 'sfs__n_features_to_select'...",0.47047,0.756757,0.613613,0.143143,2
5,1778.722728,16.447673,0.014638,0.000975,500,100,"{'k_best__k': 500, 'sfs__n_features_to_select'...",0.391917,0.721845,0.556881,0.164964,9
6,34.331022,0.095648,0.030743,0.005368,all,1,"{'k_best__k': 'all', 'sfs__n_features_to_selec...",0.478811,0.640881,0.559846,0.081035,6
7,276.593944,1.071516,0.025864,0.004392,all,10,"{'k_best__k': 'all', 'sfs__n_features_to_selec...",0.47047,0.748442,0.609456,0.138986,3
8,2287.638228,54.715122,0.011218,0.000485,all,100,"{'k_best__k': 'all', 'sfs__n_features_to_selec...",0.492783,0.721665,0.607224,0.114441,4


In [85]:
grid.best_estimator_[:-1].get_feature_names_out()

array(['ATS6m', 'AATS0v', 'GATS1i', 'RNCG', 'IC5', 'Kier2',
       'FilterItLogS', 'MPC7', 'n6aRing', 'SLogP'], dtype=object)

In [80]:
grid.best_estimator_.score(X_train, y_train)

0.7894151521089083

In [81]:

grid.best_estimator_.score(X_test, y_test)

0.7414735560223737