In [None]:
import sys
# !{sys.executable} -m pip install shap
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

# SHAP
import shap

In [None]:


# RBF kernel instance referenced by the 'gpr' hyper-parameter grid below.
gaussian_kernel = RBF()

# NOTE(review): the note that used to sit here said to remove the basic linear
# models (ridge, lasso, elastic) and the expensive GPR, but all of them are
# still registered below. Confirm whether they should actually be dropped.

# Registry of unfitted estimators, keyed by the short model name that
# gridsearch() uses to look up both the estimator and its parameter grid.
models = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    'rfr': RandomForestRegressor(),
    'grad': GradientBoostingRegressor(),
    'svr': SVR(),
    'krr': KernelRidge(),
    'gpr': GaussianProcessRegressor()
}

# Hyper-parameter grids handed to GridSearchCV, keyed by the same short names
# as `models` (every key in `models` has an entry here).
param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'knn': {
        'n_neighbors': [1,5,10],
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    'krr': {
        'kernel': ['linear', 'laplacian', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    }
}


In [None]:
def run_regular(model):
    """
    Fit and score a model on the full, unreduced feature set.

    Features and targets are min-max scaled to [-1, 1]; both scalers are
    fitted on the training split only and then applied to the test split.
    The scaled arrays are handed to ``gridsearch`` together with the fitted
    target scaler so metrics can be reported in the original target units.

    Reads the notebook-level ``X``, ``Y``, ``train`` and ``test`` objects.

    params
    ------
    model: str
        Model to test (key into ``models`` / ``param_grid``)

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics

    model: trained model
        Best estimator returned by ``gridsearch``

    """
    x_scaler = MinMaxScaler((-1, 1))
    y_scaler = MinMaxScaler((-1, 1))

    X_train = x_scaler.fit_transform(X.loc[train])
    X_test = x_scaler.transform(X.loc[test])
    y_train = y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten()
    y_test = y_scaler.transform(Y.loc[test].to_numpy()).flatten()

    # gridsearch calls scaler.inverse_transform on targets and predictions, so
    # it needs the *fitted* y_scaler instance. Passing the MinMaxScaler class
    # itself (as this function used to) makes that an unbound call that fails.
    scores, model = gridsearch(model, X_train, y_train, X_test, y_test, y_scaler)

    return scores, model
    

In [None]:
def test_pca(components,model):
    """
    Test dimensionality reduction using principal component analysis (PCA)
    
    params
    ------
    components: int
        Number of dimensions to reduce too
        
    model: str
        Model to test

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics

    model: trained model
    
    """
    scaler = MinMaxScaler
    x_scaler = scaler((-1, 1))
    y_scaler = scaler((-1, 1))
    
    pca = PCA(n_components=components)
    X_train, X_test, y_train, y_test = pca.fit_transform(x_scaler.fit_transform(X.loc[train])), pca.transform(x_scaler.transform(X.loc[test])), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
    scores,model=gridsearch(model,X_train, y_train,X_test,y_test,y_scaler)
    # with open(f'{components}_Morgan_train.bin','wb') as f:
    #     joblib.dump({'X':X_train,'y':y_train},f)
    # with open(f'{components}_Morgan_test.bin','wb') as f:
    #     joblib.dump({'X':X_test,'y':y_test},f)
    # with open(f'{components}_Morgan_scaler.bin','wb') as f:
    #     joblib.dump(y_scaler,f)
    return scores,model
    

In [None]:
def test_shap(n_feats,model):
    """
    Test dimensionality reduction using SHapely Additive Explanations (SHAP)
    
    params
    ------
    n_feats: int
        Number of best features to reduce too
        
    model: str
        Model to test

    returns
    -------
    scores: dict
        Dictionary containing evaluation metrics
    
    model: trained model
    
    """
    scaler = MinMaxScaler
    x_scaler = scaler((-1, 1))
    y_scaler = scaler((-1, 1))
    
    SHAPX=X.iloc[:,sorted_cols[-n_feats:]]
    
    X_train, X_test, y_train, y_test = x_scaler.fit_transform(SHAPX.loc[train].to_numpy()), x_scaler.transform(SHAPX.loc[test].to_numpy()), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
    
    scores,model=gridsearch(model,X_train, y_train,X_test,y_test,y_scaler)
    # with open(f'{n_feats}_SHAP_train.bin','wb') as f:
    #     joblib.dump({'X':X_train,'y':y_train},f)
    # with open(f'{n_feats}_SHAP_test.bin','wb') as f:
    #     joblib.dump({'X':X_test,'y':y_test},f)
    # with open(f'{n_feats}_SHAP_scaler.bin','wb') as f:
    #     joblib.dump(y_scaler,f)
        
    return scores,model
    

In [None]:
def gridsearch(model,X_tr, y_tr,X_tst,y_tst,scaler):
    """
    Tune one registered model with GridSearchCV and score the best estimator.

    parameters
    ----------
    model: str
        Name of model (key into ``models`` and ``param_grid``)

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training (1-D, in scaled units)

    X_tst: np.array
        X test

    y_tst: np.array
        y test (1-D, in scaled units)

    scaler: object with ``inverse_transform``
        Applied to targets and predictions (reshaped to a single column)
        before the metrics are computed.

    Returns
    -------
    scores: dict
        MSE / R2 / MAE for the train and test splits, computed on the
        inverse-transformed values

    best_model: estimator
        ``best_estimator_`` of the fitted grid search
    """
    st = time.time()

    # 5-fold search over the registered grid for this model name.
    search = GridSearchCV(models[model], param_grid[model], cv=5,n_jobs=12,verbose=10000)

    print(f'Now fitting {model}... ')
    search.fit(X_tr, y_tr)
    best_model = search.best_estimator_
    print(f'Completed fitting {model} in {time.time() - st:.4f} seconds. ')

    print('Now scoring model... ')
    train_pred_scaled = best_model.predict(X_tr)
    test_pred_scaled = best_model.predict(X_tst)

    def _unscale(values):
        # Undo the target scaling so metrics are reported on the inverse-transformed values.
        return scaler.inverse_transform(values.reshape(-1,1)).flatten()

    train_true, train_pred = _unscale(y_tr), _unscale(train_pred_scaled)
    test_true, test_pred = _unscale(y_tst), _unscale(test_pred_scaled)

    scores = {
        # NOTE(review): 'MSE_train' is stored as a 1-tuple (the original line
        # ended in a stray comma). It is kept as-is because the notebook later
        # calls pd.DataFrame(scores) without an index, which only works while
        # one value is list-like. Make this a plain float together with that
        # call site.
        'MSE_train': (mean_squared_error(train_true, train_pred),),
        'R2_train': r2_score(train_true, train_pred),
        'MAE_train': mean_absolute_error(train_true, train_pred),
        'MSE_test': mean_squared_error(test_true, test_pred),
        'R2_test': r2_score(test_true, test_pred),
        'MAE_test': mean_absolute_error(test_true, test_pred),
    }

    print(f"Train R2 {scores['R2_train']:.4f}")
    print(f"Test R2 {scores['R2_test']:.4f}")
    print(f"Train MAE {scores['MAE_train']:.4f}")
    print(f"Test MAE {scores['MAE_test']:.4f}")
    print()
    return scores, best_model



In [None]:
# The database directory sits two levels above the current working directory.
database_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'database')

# Targets, indexed by the first CSV column.
Y = pd.read_csv(os.path.join(database_dir, 'y.csv.gz'), compression='gzip', index_col=0)

# Features. Read unconditionally: with an os.path.exists() guard a missing file
# either left X undefined (NameError further down) or silently reused a stale X
# from an earlier kernel run. read_csv raises FileNotFoundError right here.
X_path = os.path.join(database_dir, 'Morgan_sub.csv.gz')
X = pd.read_csv(X_path, compression='gzip', index_col=0)

# Split on index labels so X and Y can both be selected with .loc below.
train, test = train_test_split(list(X.index), train_size=0.8, test_size=0.2, random_state=42)

# Unscaled splits; targets are flattened to 1-D arrays.
X_train, X_test = X.loc[train], X.loc[test]
y_train, y_test = Y.loc[train].to_numpy().flatten(), Y.loc[test].to_numpy().flatten()

In [None]:
# Sanity check: (rows, columns) of the unscaled training feature frame.
X_train.shape

In [None]:
# Notebook-level scaler class and two unfitted [-1, 1] scaler instances.
scaler = MinMaxScaler
x_scaler, y_scaler = scaler((-1, 1)), scaler((-1, 1))

# Run configuration: model key plus the default reduced dimensionality.
model = 'rfr'
n_feats = components = 5

# Baseline on the full feature set. Note that `model` is rebound here from the
# model name (str) to the fitted estimator returned by run_regular.
scores, model = run_regular(model)



In [None]:
# One-row summary of the full-feature baseline, labelled 'full'.
# Passing `index` explicitly makes the constructor accept a dict of plain
# scalars. Without it pandas raises "If using all scalar values, you must pass
# an index", and the call only succeeded because gridsearch currently stores
# scores['MSE_train'] as a 1-tuple. The resulting frame is the same either way.
scores_df = pd.DataFrame(scores, index=['full'])
scores_df['features'] = X_train.shape[1]

In [None]:
# # SHAP
# `model` was fitted inside run_regular on features min-max scaled to [-1, 1]
# with a scaler fitted on the training split, so SHAP has to probe it in that
# same feature space. Refit an identical scaler here instead of passing the
# raw, unscaled X_test to the fitted estimator.
shap_scaler = MinMaxScaler((-1, 1)).fit(X_train)
X_test_scaled = pd.DataFrame(
    shap_scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)

# NOTE(review): importances are estimated on the held-out split and later
# drive feature selection. Consider using the training split instead to avoid
# leaking test information; confirm intent.
explainer = shap.Explainer(model.predict, X_test_scaled,n_jobs=12,max_evals=X.shape[1]*2 + 1)
shap_values = explainer(X_test_scaled)
shap.plots.bar(shap_values,max_display=16)


In [None]:
reduced={}
model='rfr'

# Rank features once, outside the loop: column positions in ascending order of
# mean |SHAP value|, so the most important features sit at the end.
# test_shap reads this notebook-level array.
sorted_cols=np.argsort(np.mean(np.abs(shap_values.values),axis=0))

for i in [5,16]:
    # Same reduced dimensionality for both strategies so they compare like for like.
    shap_scores,shap_model=test_shap(i,model)
    pca_scores,pca_model=test_pca(i,model)

    reduced[i]={'SHAP':shap_scores,'PCA':pca_scores}

In [None]:
# Display the full-feature baseline metrics.
scores_df

In [None]:
# One frame per reduced feature count, with one row per reduction method.
dfredfmt = []
for k,v in reduced.items():
    # v maps method name ('SHAP' / 'PCA') -> metrics dict; transpose so each
    # method becomes a row. The transpose leaves every column as object dtype
    # whenever any metric is non-scalar, so infer_objects() is used to restore
    # numeric dtypes on the columns that hold plain numbers.
    dfred = pd.DataFrame(v).T.infer_objects()
    dfred['features']=k
    # Row labels become '<method>_<feature count>', e.g. 'SHAP_5'.
    dfred.index = [f'{method}_{k}' for method in dfred.index]
    dfredfmt.append(dfred)

In [None]:
# Stack the reduced-feature rows on top of the full-feature baseline row.
data_df = pd.concat([*dfredfmt, scores_df])

In [None]:
# Inspect the available metric columns before reshaping for plotting.
data_df.columns

In [None]:
# Plot grouping label: the part of the row label before the first underscore
# ('SHAP_5' -> 'SHAP'); labels without an underscore ('full') are kept whole.
data_df['set'] = [label.split('_')[0] for label in data_df.index]

In [None]:
# Long-format frames for seaborn: one row per (set, features, Train/Test) value.
_melt_kwargs = dict(id_vars=['set','features'], value_vars=['Train','Test'])

MAE_melt = (
    data_df
    .rename(columns={'MAE_train':"Train",'MAE_test':'Test'})
    .melt(**_melt_kwargs)
)

R2_melt = (
    data_df
    .rename(columns={'R2_train':"Train",'R2_test':'Test'})
    .melt(**_melt_kwargs)
)

In [None]:
# R^2 per feature set, one panel each for the Train and Test splits.
g = sns.catplot(data=R2_melt,x='set',y='value',col='variable',hue='features',kind='bar',palette=sns.color_palette('Paired',6))
g.set_axis_labels("Set", "R$^{2}$")
g.set_titles("{col_name}")
g._legend.set_bbox_to_anchor((1.06, 0.5))  # Position legend to the right
g._legend.set_frame_on(True)

# Add labels over each bar (zero-height patches are skipped)
for ax in g.axes.flat:
    for bar in ax.patches:
        height = bar.get_height()
        if height!=0:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height,
                f'{height:.2f}',  # Format the label as desired
                ha='center',
                va='bottom'
            )

plt.tight_layout()
# Save under an R2-specific name: this cell used to write to
# classical_features_MAE.png, which the MAE cell then overwrote, so the R^2
# figure was never kept on disk.
plt.savefig(os.path.join(os.path.expanduser("~"),'qregress/images/BSE/classical_features_R2.png'),dpi=300,bbox_inches='tight')

In [None]:
# Mean absolute error per feature set, one panel each for the Train and Test splits.
g = sns.catplot(
    data=MAE_melt,
    x='set',
    y='value',
    col='variable',
    hue='features',
    kind='bar',
    palette=sns.color_palette('Paired',6),
)
g.set_axis_labels("Set", "Mean Absolute Error (kcal/mol)")
g.set_titles("{col_name}")

# Annotate every bar with its value; zero-height patches are skipped.
for ax in g.axes.flat:
    for bar in ax.patches:
        height = bar.get_height()
        if height == 0:
            continue
        x_center = bar.get_x() + bar.get_width() / 2
        ax.text(x_center, height, f'{height:.2f}', ha='center', va='bottom')

plt.tight_layout()

mae_figure_path = os.path.join(os.path.expanduser("~"),'qregress/images/BSE/classical_features_MAE.png')
plt.savefig(mae_figure_path,dpi=300,bbox_inches='tight')