In [1]:
import sys
# !{sys.executable} -m pip install shap
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

# SHAP
import shap

In [2]:
# glob(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database','*.csv.gz'))

In [3]:
trainsize=[ 0.1,0.3, 0.5, 0.7,0.8]

In [4]:


gaussian_kernel = RBF()

# remove basic linear models (ridge, lasso, elastic) and expensive models (gpr)

models = {
    # 'ridge': Ridge(),
    # 'lasso': Lasso(),
    # 'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    'rfr': RandomForestRegressor(),
    # 'grad': GradientBoostingRegressor(),
    # 'svr': SVR(),
    'krr': KernelRidge(),
    # 'gpr': GaussianProcessRegressor()
}

param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'knn': {
        'n_neighbors': [1,5,10],
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    'krr': {
        'kernel': ['linear', 'laplacian', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    }
}


In [5]:
def gridsearch(model,X_tr, y_tr,X_tst,y_tst):
    """
    Perform GridSearchCV for a given model

    parameters
    ----------
    model: str
        Name of model

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training

    X_tst: np.array
        X test

    y_tst: np.array
        y test

    Returns
    -------
    scores: dict
        scored models
    """
    st = time.time()
    # Grab model and model parameters to perform gridsearchcv
    current_model = models[model]
    current_param_grid = param_grid[model]

    grid_search = GridSearchCV(current_model, current_param_grid, cv=5,n_jobs=-1)

    print(f'Now fitting {model}... ')

    grid_search.fit(X_tr, y_tr)

    best_model = grid_search.best_estimator_

    print(f'Completed fitting {model} in {time.time() - st:.4f} seconds. ')

    # Take the best model and evaluate using known metrics
    model=best_model
    scores = {}
    st = time.time()
    print('Now scoring model... ')
    y_tr_pred = model.predict(X_tr)
    y_tst_pred = model.predict(X_tst)

    # plt.scatter(y_tr,y_tr_pred)
    # plt.scatter(y_tst,y_tst_pred)
    # plt.scatter(y_tr, y_tr)
    # plt.scatter(y_tst,y_tst)
    # plt.show()
            
    scores['MSE_train'] = mean_squared_error(y_tr, y_tr_pred),
    scores['R2_train'] = r2_score(y_tr, y_tr_pred)
    scores['MAE_train'] = mean_absolute_error(y_tr, y_tr_pred)
    
    scores['MSE_test'] = mean_squared_error(y_tst, y_tst_pred)
    scores['R2_test'] = r2_score(y_tst, y_tst_pred)
    scores['MAE_test'] = mean_absolute_error(y_tst, y_tst_pred)
    print(f"Train R2 {scores['R2_train']:.4f}")
    print(f"Test R2 {scores['R2_test']:.4f}")
    print(f"Train MAE {scores['MAE_train']:.4f}")
    print(f"Test MAE {scores['MAE_test']:.4f}")
    print()
    return scores, best_model



In [6]:
# for r in trainsize:
#     Y=pd.read_csv(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database','y.csv.gz'), compression='gzip',index_col=0)
#     X=pd.read_csv(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database',f'Morgan_sub.csv.gz'), compression='gzip',index_col=0)
#     print(X.shape[1])
#     train,test=train_test_split(list(X.index), train_size=0.8,test_size=0.2,random_state=42)
#     X_train, X_test, y_train, y_test = X.loc[train].to_numpy(), X.loc[test].to_numpy(), Y.loc[train].to_numpy().flatten(), Y.loc[test].to_numpy().flatten()
    
#     scaler = MinMaxScaler
#     x_scaler = scaler((-1, 1))
#     y_scaler = scaler((-1, 1))
    
#     X_val = np.empty(X.shape)
#     y_val = np.empty(Y.shape)
#     y_train, y_test, y_val = y_train.reshape(-1, 1), y_test.reshape(-1, 1), y_val.reshape(-1, 1)
#     X_train = x_scaler.fit_transform(X_train)
#     X_test = x_scaler.transform(X_test)
#     X_val = x_scaler.transform(X_val)
#     y_train = y_scaler.fit_transform(y_train)
#     y_test = y_scaler.transform(y_test)
#     y_val = y_scaler.transform(y_val)
    
#     components=5
#     pca = PCA(n_components=components)
#     X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
    
#     with open(f'PCA{components}_{r}_Morgan_train.bin','wb') as f:
#         joblib.dump({'X':X_train,'y':y_train},f)
#     with open(f'PCA{components}_{r}_Morgan_test.bin','wb') as f:
#         joblib.dump({'X':X_test,'y':y_test},f)
    
#     with open(f'PCA{components}_{r}_Morgan_scaler.bin','wb') as f:
#         joblib.dump(y_scaler,f)

In [7]:
scaler = MinMaxScaler
x_scaler = scaler((-1, 1))
y_scaler = scaler((-1, 1))


In [8]:
Y=pd.read_csv(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database','y.csv.gz'), compression='gzip',index_col=0)
Reps=['CM', 'MACCS', 'PI', 'RDKit', 'Morgan']
      # , 'SOAP']
divisions=["AB","sub"]

test_models_out={}
for r in Reps:
    test_models_out[r]={}
    for d in divisions:
        test_models_out[r][d]={}
        print(f'Start: {r}_{d}')
        X_path=os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database',f'{r}_{d}.csv.gz')
        if os.path.exists(X_path):
            X=pd.read_csv(X_path, compression='gzip',index_col=0)
            print(X.shape[1])
            train,test=train_test_split(list(X.index), train_size=0.8,test_size=0.2,random_state=42)
            X_train, X_test, y_train, y_test = x_scaler.fit_transform(X.loc[train].to_numpy()), x_scaler.transform(X.loc[test].to_numpy()), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
            
            scaler = MinMaxScaler
            x_scaler = scaler((-1, 1))
            y_scaler = scaler((-1, 1))
            
            X_val = np.empty(X.shape)
            y_val = np.empty(Y.shape)
            y_train, y_test, y_val = y_train.reshape(-1, 1), y_test.reshape(-1, 1), y_val.reshape(-1, 1)
            X_train = x_scaler.fit_transform(X_train)
            X_test = x_scaler.transform(X_test)
            X_val = x_scaler.transform(X_val)
            y_train = y_scaler.fit_transform(y_train)
            y_test = y_scaler.transform(y_test)
            y_val = y_scaler.transform(y_val)
            
            scoring={}
            t0_init=time.perf_counter()
            for m in models.keys():
                t0=time.perf_counter()
                scores,model=gridsearch(m,X_train, y_train,X_test,y_test)
                tf=time.perf_counter()-t0
                scores['timing']=tf
                scoring[m]=scores
                print(m,tf)
            print(f"Overall {time.perf_counter()-t0_init:.2f}")
            stat_df=pd.concat([pd.DataFrame.from_dict(v).rename(index={0:k}) for k,v in scoring.items()])
            test_models_out[r][d]=stat_df
        print()
    print()

Start: CM_AB
42
Now fitting knn... 


Completed fitting knn in 0.6009 seconds. 
Now scoring model... 
Train R2 0.9850
Test R2 0.2297
Train MAE 0.0097
Test MAE 0.2098

knn 0.7072689730121056
Now fitting rfr... 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


Completed fitting rfr in 44.2672 seconds. 
Now scoring model... 
Train R2 0.8842
Test R2 0.4024
Train MAE 0.0778
Test MAE 0.1859

rfr 44.371315402007895
Now fitting krr... 


Completed fitting krr in 12.3675 seconds. 
Now scoring model... 


Train R2 0.8578
Test R2 0.4140
Train MAE 0.0851
Test MAE 0.1816

krr 12.68994360200304
Overall 57.77

Start: CM_sub
42
Now fitting knn... 
Completed fitting knn in 0.1576 seconds. 
Now scoring model... 


Train R2 1.0000
Test R2 0.3665
Train MAE 0.0000
Test MAE 0.1898

knn 0.17358578900166322
Now fitting rfr... 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


Completed fitting rfr in 49.2184 seconds. 
Now scoring model... 
Train R2 0.9228
Test R2 0.4210
Train MAE 0.0622
Test MAE 0.1738

rfr 49.36548210399633
Now fitting krr... 


Completed fitting krr in 12.3479 seconds. 
Now scoring model... 


Train R2 0.9725
Test R2 0.4809
Train MAE 0.0323
Test MAE 0.1696

krr 12.666206861002138
Overall 62.21


Start: MACCS_AB
167
Now fitting knn... 


Completed fitting knn in 0.1780 seconds. 
Now scoring model... 
Train R2 0.7524
Test R2 0.6082
Train MAE 0.1150
Test MAE 0.1493

knn 0.20896134599752259
Now fitting rfr... 


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


Completed fitting rfr in 17.7863 seconds. 
Now scoring model... 
Train R2 0.8816
Test R2 0.7350
Train MAE 0.0738
Test MAE 0.1161

rfr 17.814447068987647
Now fitting krr... 


Completed fitting krr in 19.5704 seconds. 
Now scoring model... 
Train R2 0.8742
Test R2 0.7426
Train MAE 0.0794
Test MAE 0.1203

krr 19.65639271400869
Overall 37.68

Start: MACCS_sub
167
Now fitting knn... 


Completed fitting knn in 0.1783 seconds. 
Now scoring model... 
Train R2 0.8606
Test R2 0.6867
Train MAE 0.0664
Test MAE 0.1271

knn 0.2086427729955176
Now fitting rfr... 


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


Completed fitting rfr in 14.5339 seconds. 
Now scoring model... 
Train R2 0.8496
Test R2 0.7465
Train MAE 0.0827
Test MAE 0.1162

rfr 14.569408988012583
Now fitting krr... 


Completed fitting krr in 20.1373 seconds. 
Now scoring model... 


Train R2 0.8359
Test R2 0.7621
Train MAE 0.0881
Test MAE 0.1166

krr 21.102658314004657
Overall 35.88


Start: PI_AB
400
Now fitting knn... 


Completed fitting knn in 0.2422 seconds. 
Now scoring model... 
Train R2 0.0413
Test R2 -0.0481
Train MAE 0.2390
Test MAE 0.2583

knn 0.294758545991499
Now fitting rfr... 


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


Completed fitting rfr in 32.4422 seconds. 
Now scoring model... 
Train R2 0.0310
Test R2 -0.0142
Train MAE 0.2362
Test MAE 0.2485

rfr 32.45411872200202
Now fitting krr... 


Completed fitting krr in 56.8570 seconds. 
Now scoring model... 


In [None]:
results_df=[]
for k,v in test_models_out.items():
    if k!='PI':
        df=v['AB'][['R2_train','R2_test']].reset_index().melt(id_vars='index')
        df['model']=len(df)*[k]
        df['Feat']=['AB']*len(df)
        df1=v['sub'][['R2_train','R2_test']].reset_index().melt(id_vars='index')
        df1['model']=len(df1)*[k]
        df1['Feat']=['sub']*len(df1)
        results_df.append(pd.concat([df,df1],axis=0))
    else:
        df=v['AB'][['R2_train','R2_test']].reset_index().melt(id_vars='index')
        df['model']=len(df)*[k]
        df['Feat']=['AB']*len(df)   
        results_df.append(df)

results_df=pd.concat(results_df)    

In [None]:
test_models_out['Morgan']['sub']

In [None]:
g.facet_data

In [None]:
g=sns.catplot(data=results_df,x='model',hue='variable',y='value',col='index',row='Feat',kind='bar',palette=sns.color_palette('Paired',5),legend=True)

# extract the matplotlib axes_subplot objects from the FacetGrid
for ax in g.axes.ravel():
    
    # iterate through the axes containers
    for c in ax.containers:
        ax.bar_label(c, fmt='{:,.2f}')
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
        
plt.tight_layout()
plt.savefig("classical_funcfit.png",dpi=300,bbox_inches='tight')
plt.show()

In [None]:
def test_shap(n_feats,model):
    """
    Test dimensionality reduction using SHapely Additive Explanations (SHAP)
    
    params
    ------
    n_feats: int
        Number of best features to reduce too
        
    model: str
        Model to test

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics
    
    model: trained model
    
    """
    scaler = MinMaxScaler
    x_scaler = scaler((-1, 1))
    y_scaler = scaler((-1, 1))
    
    SHAPX=X.iloc[:,sorted_cols[-n_feats:]]
    
    X_train, X_test, y_train, y_test = x_scaler.fit_transform(SHAPX.loc[train].to_numpy()), x_scaler.transform(SHAPX.loc[test].to_numpy()), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
    
    scores,model=gridsearch(model,X_train, y_train,X_test,y_test)
    with open(f'{n_feats}_SHAP_train.bin','wb') as f:
        joblib.dump({'X':X_train,'y':y_train},f)
    with open(f'{n_feats}_SHAP_test.bin','wb') as f:
        joblib.dump({'X':X_test,'y':y_test},f)
    with open(f'{n_feats}_SHAP_scaler.bin','wb') as f:
        joblib.dump(y_scaler,f)
        
    return scores,model
    

In [None]:
def test_pca(components,model):
    """
    Test dimensionality reduction using principal component analysis (PCA)
    
    params
    ------
    components: int
        Number of dimensions to reduce too
        
    model: str
        Model to test

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics

    model: trained model
    
    """
    scaler = MinMaxScaler
    x_scaler = scaler((-1, 1))
    y_scaler = scaler((-1, 1))
    
    pca = PCA(n_components=components)
    X_train, X_test, y_train, y_test = pca.fit_transform(x_scaler.fit_transform(X.loc[train])), pca.transform(x_scaler.transform(X.loc[test])), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
    scores,model=gridsearch(model,X_train, y_train,X_test,y_test)
    with open(f'{components}_Morgan_train.bin','wb') as f:
        joblib.dump({'X':X_train,'y':y_train},f)
    with open(f'{components}_Morgan_test.bin','wb') as f:
        joblib.dump({'X':X_test,'y':y_test},f)
    with open(f'{components}_Morgan_scaler.bin','wb') as f:
        joblib.dump(y_scaler,f)
    return scores,model
    

In [None]:
def run_regular(model):
    """
    
    
    params
    ------    
    model: str
        Model to test

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics

    model: trained model
    
    """
    X_train, X_test, y_train, y_test = X.loc[train], X.loc[test], Y.loc[train].to_numpy(), Y.loc[test].to_numpy()
    scores,model=gridsearch(model,X_train, y_train,X_test,y_test)
    
    return scores,model
    

In [None]:
model='rfr'
n_feats=5
components=5

# # Normal
scores,model=run_regular(model)



In [None]:
X.shape

In [None]:
# # SHAP
explainer = shap.Explainer(model.predict, X_test,n_jobs=-1,max_evals=2048)
shap_values = explainer(X)
shap.plots.bar(shap_values,max_display=16)


In [None]:
reduced={}
model='rfr'
for i in [5,16]:
    sorted_cols=np.argsort(np.mean(np.abs(shap_values.values),axis=0))
    shap_scores,shap_model=test_shap(i,model)
    
    #PCA
    pca_scores,pca_model=test_pca(i,model)

    reduced[i]={'SHAP':shap_scores,'PCA':pca_scores}

In [None]:
fig,ax=plt.subplots(1,2,figsize=(10,5),sharey=True)
pal=sns.color_palette('Paired',4)
for idx,(k,v) in enumerate(reduced.items()):
    df=pd.DataFrame.from_dict(v).loc[['R2_train','R2_test']].reset_index().melt(id_vars='index') 
    if idx==0:
        ax[idx]=sns.barplot(data=df,x='variable',hue='index',y='value',palette=[pal[idx],pal[idx+1]],ax=ax[idx])
    else:
        ax[idx]=sns.barplot(data=df,x='variable',hue='index',y='value',palette=[pal[2],pal[3]],ax=ax[idx])

    for container in ax[idx].containers:
        print(container)
        ax[idx].bar_label(container, fmt='{:,.2f}')
    ax[idx].set_title(f"{k} Features")
    ax[idx].set_xlabel('Featurization')
    ax[idx].set_ylabel("R$^{2}$")
plt.tight_layout()
plt.savefig('Feat_redR2.png',dpi=300,bbox_inches='tight')