In [None]:
import sys
# !{sys.executable} -m pip install Ripser Cython
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter
from Element_PI import VariancePersistv1

# SHAP
import shap

from chemml.chem import Molecule
from chemml.chem import RDKitFingerprint


In [None]:
# Persistence-image settings forwarded to VariancePersistv1 further down.
# Resolution in pixels per axis: each image contributes pixelsx * pixelsy
# features, so keep it small (150 x 150 is the larger alternative).
pixelsx = pixelsy = 20
spread = 0.06  # passed as `myspread`
Max = 2.5  # passed as myspecs["maxBD"]



In [None]:
# Read the 'BSE' entry of each intermediate file (existing first, then hypothetical).
# NOTE(review): joblib.load unpickles its input, so only point it at trusted files.
existsY, hypoY = (
    joblib.load(bin_path)['BSE']
    for bin_path in (
        "/Users/grierjones/qregress/database/processed/intermediate/BSE49_existing.bin",
        "/Users/grierjones/qregress/database/processed/intermediate/BSE49_Hypothetical.bin",
    )
)

In [None]:
# Zero-initialised design matrix and target vector:
# one row per entry of existsY and hypoY combined, one column per image pixel.
n_rows = len(existsY) + len(hypoY)
n_cols = pixelsx * pixelsy
X = np.zeros((n_rows, n_cols))
y = np.zeros(n_rows)

In [None]:
def _fill_persistence_rows(csv_path, geometry_dir, targets, first_row):
    """
    Compute one persistence image per molecule of a list and store it in X / y.

    For every path in the CSV's 'XYZ' column, the file name is taken, its
    '_A.xyz' suffix is replaced by '_AB.xyz', and VariancePersistv1 is run on
    that file inside ``geometry_dir``. The result is written to row
    ``first_row + idx`` of the module-level ``X``; the matching target
    ``targets.loc[idx]`` is written to the same row of ``y``.

    Parameters
    ----------
    csv_path: str
        CSV file with an 'XYZ' column of paths ending in '_A.xyz'

    geometry_dir: str
        Directory that contains the corresponding '_AB.xyz' files

    targets: indexable with ``.loc``
        Target values, looked up by the molecule's position in the CSV

    first_row: int
        Row of X / y that receives the first molecule of this list

    Returns
    -------
    next_row: int
        Index of the first row not written by this call
    """
    n_done = 0
    for idx, xyz_a in enumerate(pd.read_csv(csv_path)['XYZ']):
        AB = f"{geometry_dir}/{os.path.basename(xyz_a).replace('_A.xyz','_AB.xyz')}"
        # Offset by first_row so that a second list is appended after the first
        # one instead of overwriting rows 0..n-1.
        row = first_row + idx
        X[row, :] = VariancePersistv1(
            AB, pixelx=pixelsx, pixely=pixelsy, myspread=spread,
            myspecs={"maxBD": Max, "minBD": -0.1}, showplot=False,
        )
        y[row] = targets.loc[idx]
        n_done += 1
    return first_row + n_done


# overIDX is the running count of filled rows; it is also the row at which the
# next molecule list starts.
overIDX = 0
overIDX = _fill_persistence_rows(
    '/Users/grierjones/qregress/database/processed/intermediate/existing_molsA.csv',
    "/Users/grierjones/qregress/database/bse49-main/Geometries/Existing",
    existsY,
    overIDX,
)
print(overIDX)
overIDX = _fill_persistence_rows(
    '/Users/grierjones/qregress/database/processed/intermediate/hypothetical_molsA.csv',
    "/Users/grierjones/qregress/database/bse49-main/Geometries/Hypothetical",
    hypoY,
    overIDX,
)
print(overIDX)

# Any row left untouched would silently stay all-zero and pollute the fit.
assert overIDX == X.shape[0], f"filled {overIDX} rows, but X has {X.shape[0]}"

In [None]:
# Sanity check: expect (len(existsY) + len(hypoY), pixelsx * pixelsy), as allocated above.
X.shape

In [None]:
# Training-set fractions to sweep (used as `train_size` in train_test_split).
trainsize = [
    0.1,
    0.3,
    0.5,
    0.7,
    0.8,
]

In [None]:


# Seed for the estimators that draw random numbers while fitting, so that
# repeated grid searches on the same data give the same result.
RANDOM_STATE = 42

# Kernel offered to the Gaussian-process regressor in its parameter grid.
gaussian_kernel = RBF()

# Estimator registry: gridsearch() looks models up by these short names.
models = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    'rfr': RandomForestRegressor(random_state=RANDOM_STATE),
    'grad': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'svr': SVR(),
    'krr': KernelRidge(),
    'gpr': GaussianProcessRegressor()
}

# Hyper-parameter grids, keyed by the same names as `models`.
param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    },
    'knn': {
        'n_neighbors': range(1, 10),
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    'krr': {
        'kernel': ['linear', 'poly', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    }
}


In [None]:
def gridsearch(model, X_tr, y_tr, X_tst, y_tst):
    """
    Tune one registered model with GridSearchCV and score the best estimator.

    The estimator and its hyper-parameter grid are looked up by name in the
    module-level ``models`` and ``param_grid`` dictionaries. A 5-fold grid
    search is fitted on the training data, the best estimator is used to
    predict both splits, a predicted-vs-reference scatter plot is shown, and
    MSE / R2 / MAE are computed for each split.

    parameters
    ----------
    model: str
        Name of model (key of ``models`` and ``param_grid``)

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training

    X_tst: np.array
        X test

    y_tst: np.array
        y test

    Returns
    -------
    scores: dict
        Metric values under the keys 'MSE_train', 'R2_train', 'MAE_train',
        'MSE_test', 'R2_test' and 'MAE_test'

    best_model: estimator
        ``best_estimator_`` of the fitted grid search

    Raises
    ------
    KeyError
        If ``model`` is not a key of ``models`` or ``param_grid``
    """
    print(X_tr.shape, y_tr.shape, X_tst.shape, y_tst.shape)
    st = time.time()

    # Grab model and model parameters to perform gridsearchcv
    grid_search = GridSearchCV(models[model], param_grid[model], cv=5, n_jobs=-1)

    print(f'Now fitting {model}... ')
    grid_search.fit(X_tr, y_tr)
    best_model = grid_search.best_estimator_
    print(f'Completed fitting {model} in {time.time() - st} seconds. ')

    # Take the best model and evaluate using known metrics
    print('Now scoring model... ')
    y_tr_pred = best_model.predict(X_tr)
    y_tst_pred = best_model.predict(X_tst)

    # Predicted vs. reference values; points on the dashed diagonal are exact.
    lo = min(np.min(y_tr), np.min(y_tst))
    hi = max(np.max(y_tr), np.max(y_tst))
    fig, ax = plt.subplots()
    ax.scatter(y_tr, y_tr_pred, label='train')
    ax.scatter(y_tst, y_tst_pred, label='test')
    ax.plot([lo, hi], [lo, hi], color='k', linestyle='--', label='ideal')
    ax.set(title=f'{model}: predicted vs. reference', xlabel='Reference', ylabel='Predicted')
    ax.legend()
    plt.show()

    # Every value is a plain number (no trailing commas, which would turn an
    # entry into a 1-tuple).
    scores = {
        'MSE_train': mean_squared_error(y_tr, y_tr_pred),
        'R2_train': r2_score(y_tr, y_tr_pred),
        'MAE_train': mean_absolute_error(y_tr, y_tr_pred),
        'MSE_test': mean_squared_error(y_tst, y_tst_pred),
        'R2_test': r2_score(y_tst, y_tst_pred),
        'MAE_test': mean_absolute_error(y_tst, y_tst_pred),
    }

    return scores, best_model



In [None]:
# performance={}
# for m in models:
#     performance[m]={}
#     for ts in trainsize:
#         X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=ts,random_state=42)
#         scores, best_model = gridsearch(m,X_train, y_train,X_test,y_test)
#         performance[m][ts]=scores
#         print(ts,scores)

In [None]:
# dfmelt=[]
# for k,v in performance.items():
#     df=pd.DataFrame.from_dict(v).loc[['MAE_train','MAE_test']].T
#     df.columns=[df.columns,[k,k]]
#     dfmelt.append(df.melt(value_vars=df.columns.tolist(),ignore_index=False))
# dfmelt=pd.concat(dfmelt)

In [None]:
# sns.barplot(dfmelt.loc[0.8],hue='variable_0',x='variable_1',y='value',palette=sns.color_palette("Paired",2))

In [None]:
# ()

# g=sns.catplot(
#     dfmelt.reset_index(), kind="bar",
#     x="index", y="value", col="variable_1", hue='variable_0',
#     height=4, aspect=1, col_wrap=5,palette=sns.color_palette("Paired",2)
# )
# g.set_axis_labels("Training Set Ratio", "R$^{2}$")
# g.set_titles("{col_var}={col_name}")
# sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
# # g.set(ylim=(0, 1))

# CHEMML TEST

In [None]:
# Morgan bit-vector fingerprint: 1024 bits, radius 3.
morgan_fp = RDKitFingerprint(
    fingerprint_type='morgan',
    vector='bit',
    n_bits=1024,
    radius=3,
)
# MACCS bit-vector fingerprint, requested with 167 bits.
MACCS_fp = RDKitFingerprint(
    fingerprint_type='maccs',
    vector='bit',
    n_bits=167,
)


In [None]:
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
from rdkit.Chem.rdmolfiles import MolFromXYZFile, MolToXYZFile

# Round-trip the B, A and AB geometries of one example through RDKit:
# read the XYZ file, sanitize, determine connectivity, write it back out.
# `mol` is left bound to the last (AB) molecule.
for xyz_in, xyz_out in (
    ('/Users/grierjones/qregress/database/bse49-main/Geometries/Existing/C-C_1-Butanol_B.xyz', 'trashB.xyz'),
    ('/Users/grierjones/qregress/database/bse49-main/Geometries/Existing/C-C_1-Butanol_A.xyz', 'trashA.xyz'),
    ('/Users/grierjones/qregress/database/bse49-main/Geometries/Existing/C-C_1-Butanol_AB.xyz', 'trashAB.xyz'),
):
    mol = MolFromXYZFile(xyz_in)
    Chem.SanitizeMol(mol)
    rdDetermineBonds.DetermineConnectivity(mol)
    MolToXYZFile(mol, xyz_out)

In [None]:
# Load the AB geometry through chemml's Molecule class (rebinds `mol`).
mol = Molecule(
    '/Users/grierjones/qregress/database/bse49-main/Geometries/Existing/C-C_1-Butanol_AB.xyz',
    'xyz',
)


In [None]:
# mol.to_smiles(kekuleSmiles=True)

In [None]:
# Display the Molecule object created above.
mol