In [None]:
import sys
# !{sys.executable} -m pip install blosc
import pickle
import blosc
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

# Scikit-learn
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import normalize


from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

# Persistent images
from Element_PI import VariancePersistv1

# SHAP
import shap

# RDKit
from rdkit.Chem.rdmolfiles import MolFromXYZFile, MolToXYZFile
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds, MACCSkeys, AllChem

# DScribe
from dscribe.descriptors import ACSF, CoulombMatrix, MBTR, SOAP, CoulombMatrix
from dscribe.kernels import REMatchKernel
from ase.io import read
from dscribe.kernels import REMatchKernel

In [None]:
# Grab the base path (directory + complex name, without the "_A.xyz" suffix) of every
# complex in the existing and hypothetical geometry folders.
# glob() returns matches in arbitrary order, so the lists are sorted to keep the row
# order of every downstream table reproducible between runs and machines.
_geom_root=f"{os.path.dirname(os.getcwd())}/database/bse49-main/Geometries"
_suffix='_A.xyz'
exists_paths=sorted(p[:-len(_suffix)] for p in glob(f"{_geom_root}/Existing/*{_suffix}"))
hypo_paths=sorted(p[:-len(_suffix)] for p in glob(f"{_geom_root}/Hypothetical/*{_suffix}"))

# Featurization from RDKit

In [None]:
def xyz_to_cleanmol(path,fragment):
    '''
    Read one fragment geometry from an xyz file and return a cleaned mol object.

    The file that is read is ``path + '_' + fragment + '.xyz'``. Bonds are added with
    rdDetermineBonds.DetermineConnectivity, then the molecule is sanitized, its
    stereochemistry is assigned (cleanIt=True) and it is kekulized.

    params
    ------
    path: str
        Path to structure, without the ``_<fragment>.xyz`` suffix

    fragment: str
        Options: A, B, AB

    returns
    -------
    mol: rdkit.Chem.rdchem.Mol
        Cleaned mol object

    raises
    ------
    ValueError
        If the xyz parser returns None instead of a molecule.
    '''

    xyz_file=path+f'_{fragment}.xyz'
    mol=MolFromXYZFile(xyz_file)
    if mol is None:
        # Fail here with the file name rather than with an opaque error in the next call
        raise ValueError(f"Could not build a molecule from {xyz_file}")
    rdDetermineBonds.DetermineConnectivity(mol)
    Chem.SanitizeMol(mol)
    Chem.AssignStereochemistry(mol,cleanIt=True)
    Chem.Kekulize(mol)

    return mol

In [None]:
def generate_allmols(paths):
    '''
    Generate rdkit.Chem.rdchem.Mol for each fragment AB, A, and B of every complex.

    A complex is only kept when all three fragments can be converted; if any of them
    fails, the whole complex is left out and a warning naming it is emitted.

    params
    ------
    paths: list
        List of base paths (without the ``_<fragment>.xyz`` suffix)

    returns
    -------
        works: dict
            Maps the base name of each path to a dictionary with keys: AB, A, and B
            and values: rdkit.Chem.rdchem.Mol for each of them, respectively.
    '''
    import warnings  # local import so this cell does not depend on the import cell

    works={}
    for base in paths:
        name=os.path.basename(base)
        try:
            # The entry is assigned only after all three fragments converted cleanly
            works[name]={frag:xyz_to_cleanmol(base,frag) for frag in ('AB','A','B')}
        except Exception as err:
            # Skipping is deliberate, but report it so dropped complexes are visible.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            warnings.warn(f"Skipping {name}: {type(err).__name__}: {err}")
    return works

In [None]:
def generate_maccs(dicts,fragment):
    '''
    Build the MACCS-key table of one fragment for every complex.

    params
    ------
    dicts: dict
        Maps each complex name to a dictionary of rdkit.Chem.rdchem.Mol objects
        keyed by fragment

    fragment: str
        Options: A, B, AB

    returns
    -------
    MACCS_df: pandas.core.frame.DataFrame
        One row per complex (indexed by name), one column per MACCS bit. Columns are
        labelled with the 'Value' column of MACCS_keys_example.xlsx.

    '''
    names=[]
    fingerprints=[]
    for name,fragments in dicts.items():
        names.append(name)
        fingerprints.append(fragments[fragment])
    fingerprints=[MACCSkeys.GenMACCSKeys(mol) for mol in fingerprints]

    # Keys from http://www.mayachemtools.org/index.html
    key_table=pd.read_excel('MACCS_keys_example.xlsx')
    key_table=key_table.drop(columns=['Unnamed: 0'])
    column_labels=key_table['Value'].to_dict()

    # Each fingerprint is expanded into a list of bits -> one row of the table
    bit_rows=[list(fp) for fp in fingerprints]
    MACCS_df=pd.DataFrame(bit_rows).rename(columns=column_labels,index=dict(enumerate(names)))

    return MACCS_df

In [None]:
def generate_FP(dicts,fragment):
    '''
    Build the RDKit-fingerprint table of one fragment for every complex.

    params
    ------
    dicts: dict
        Maps each complex name to a dictionary of rdkit.Chem.rdchem.Mol objects
        keyed by fragment

    fragment: str
        Options: A, B, AB

    returns
    -------
    FP_df: pandas.core.frame.DataFrame
        One row per complex (indexed by name), one column per fingerprint bit

    '''
    names=[]
    fragment_mols=[]
    for name,fragments in dicts.items():
        names.append(name)
        fragment_mols.append(fragments[fragment])

    generator=AllChem.GetRDKitFPGenerator()
    fingerprints=[generator.GetFingerprint(mol) for mol in fragment_mols]

    # Each fingerprint is expanded into a list of bits -> one row of the table
    bit_rows=[list(fp) for fp in fingerprints]
    FP_df=pd.DataFrame(bit_rows).rename(index=dict(enumerate(names)))
    return FP_df

In [None]:
def generate_Morgan(dicts,fragment):
    '''
    Build the Morgan-fingerprint (radius 2) table of one fragment for every complex.

    params
    ------
    dicts: dict
        Maps each complex name to a dictionary of rdkit.Chem.rdchem.Mol objects
        keyed by fragment

    fragment: str
        Options: A, B, AB

    returns
    -------
    Morgan_df: pandas.core.frame.DataFrame
        One row per complex (indexed by name), one column per fingerprint bit

    '''
    names=[]
    fragment_mols=[]
    for name,fragments in dicts.items():
        names.append(name)
        fragment_mols.append(fragments[fragment])

    generator=AllChem.GetMorganGenerator(radius=2)
    fingerprints=[generator.GetFingerprint(mol) for mol in fragment_mols]

    # Each fingerprint is expanded into a list of bits -> one row of the table
    bit_rows=[list(fp) for fp in fingerprints]
    Morgan_df=pd.DataFrame(bit_rows).rename(index=dict(enumerate(names)))
    return Morgan_df

In [None]:
def generate_allMACCS(dicts):
    '''
    Assemble the two MACCS feature sets from the per-fragment tables.

    params
    ------
    dicts: dict
        Dictionary handed to generate_maccs for each of the fragments A, B, and AB

    returns
    -------
    data_dict: dict
        Dictionary containing two sets of features: AB and sub ((A+B)-AB as described in https://doi.org/10.1021/acs.jpca.2c08340)
    '''
    per_fragment={frag:generate_maccs(dicts,frag) for frag in ('AB','A','B')}

    # Positional index -> complex name, applied to both outputs
    relabel=dict(enumerate(dicts.keys()))

    difference=(per_fragment['A']+per_fragment['B'])-per_fragment['AB']
    data_dict={
        'AB':per_fragment['AB'].rename(index=relabel),
        'sub':difference.rename(index=relabel),
    }
    return data_dict

In [None]:
def generate_allFP(dicts):
    '''
    Assemble the two RDKit-fingerprint feature sets from the per-fragment tables.

    params
    ------
    dicts: dict
        Dictionary handed to generate_FP for each of the fragments A, B, and AB

    returns
    -------
    data_dict: dict
        Dictionary containing two sets of features: AB and sub ((A+B)-AB as described in https://doi.org/10.1021/acs.jpca.2c08340)
    '''
    per_fragment={frag:generate_FP(dicts,frag) for frag in ('AB','A','B')}

    # Positional index -> complex name, applied to both outputs
    relabel=dict(enumerate(dicts.keys()))

    difference=(per_fragment['A']+per_fragment['B'])-per_fragment['AB']
    data_dict={
        'AB':per_fragment['AB'].rename(index=relabel),
        'sub':difference.rename(index=relabel),
    }
    return data_dict

In [None]:
def generate_allMorgan(dicts):
    '''
    Assemble the two Morgan-fingerprint feature sets from the per-fragment tables.

    params
    ------
    dicts: dict
        Dictionary handed to generate_Morgan for each of the fragments A, B, and AB

    returns
    -------
    data_dict: dict
        Dictionary containing two sets of features: AB and sub ((A+B)-AB as described in https://doi.org/10.1021/acs.jpca.2c08340)
    '''
    per_fragment={frag:generate_Morgan(dicts,frag) for frag in ('AB','A','B')}

    # Positional index -> complex name, applied to both outputs
    relabel=dict(enumerate(dicts.keys()))

    difference=(per_fragment['A']+per_fragment['B'])-per_fragment['AB']
    data_dict={
        'AB':per_fragment['AB'].rename(index=relabel),
        'sub':difference.rename(index=relabel),
    }
    return data_dict

In [None]:
# Build cleaned RDKit mol objects (AB, A and B) for the hypothetical and existing complexes.
# Complexes for which any fragment cannot be converted are left out of the returned dictionaries.
hypo_mols=generate_allmols(hypo_paths)
exists_mols=generate_allmols(exists_paths)

In [None]:
# MACCS features for the existing and hypothetical sets
maccs_exists_data=generate_allMACCS(exists_mols)
maccs_hypo_data=generate_allMACCS(hypo_mols)

# Stack existing rows on top of hypothetical rows for each feature set; saved later
maccs_combined={
    key:pd.concat([maccs_exists_data[key],maccs_hypo_data[key]])
    for key in ('AB','sub')
}

In [None]:
# RDKit-fingerprint features for the existing and hypothetical sets
FP_exists_data=generate_allFP(exists_mols)
FP_hypo_data=generate_allFP(hypo_mols)

# Stack existing rows on top of hypothetical rows for each feature set; saved later
FP_combined={
    key:pd.concat([FP_exists_data[key],FP_hypo_data[key]])
    for key in ('AB','sub')
}

In [None]:
# Morgan-fingerprint features for the existing and hypothetical sets
Morgan_exists_data=generate_allMorgan(exists_mols)
Morgan_hypo_data=generate_allMorgan(hypo_mols)

# Stack existing rows on top of hypothetical rows for each feature set; saved later
Morgan_combined={
    key:pd.concat([Morgan_exists_data[key],Morgan_hypo_data[key]])
    for key in ('AB','sub')
}

# DScribe Features

In [None]:
# Restrict every later featurization to the complexes that produced mol objects,
# so all representations share the same set of rows
use_exists=maccs_exists_data['AB'].index.tolist()
use_hypo=maccs_hypo_data['AB'].index.tolist()

In [None]:
# Read the element-symbol column of every AB geometry once (the first two lines of an
# xyz file are skipped as header) and reuse it for both quantities below.
_geom_root=f'{os.path.dirname(os.getcwd())}/database/bse49-main/Geometries'
_exist_symbols=[np.genfromtxt(f'{_geom_root}/Existing/{name}_AB.xyz',skip_header=2,dtype=str)[:,0] for name in use_exists]
_hypo_symbols=[np.genfromtxt(f'{_geom_root}/Hypothetical/{name}_AB.xyz',skip_header=2,dtype=str)[:,0] for name in use_hypo]

# Get all atomic species (passed to the SOAP descriptor)
exist_species=np.unique(np.hstack(_exist_symbols))
hypo_species=np.unique(np.hstack(_hypo_symbols))

all_species=list(np.unique(np.hstack([exist_species,hypo_species])))

# Get maximum number of atoms for CMs.
# Both sets are considered: the Coulomb matrix is also built for the hypothetical
# complexes, so its size must cover the largest molecule of either set.
max_num_atoms=max(symbols.size for symbols in _exist_symbols+_hypo_symbols)

In [None]:
# Initialize dscribe feature parameters

# SOAP descriptor: species, r_cut, n_max, and l_max (plus sigma, non-periodic)
soap = SOAP(
    species=all_species,
    r_cut=3,
    n_max=4,
    l_max=3,
    sigma=1.5,
    periodic=False,
)
# REMatch kernel that gen_dscribe_features applies to the SOAP output.
# The name `re` is what gen_dscribe_features looks up, so it is kept as is.
re = REMatchKernel(
    metric="rbf",
    gamma=2,
    alpha=1.2,
    threshold=1e-8,
    normalize_kernel=False,
)
# Alternative SOAP setting kept for reference:
# soap = SOAP(species=all_species,r_cut=6.0,n_max=8,l_max=6,average="inner",sparse=False)

# Coulomb matrix descriptor sized for the largest molecule
cm = CoulombMatrix(
    n_atoms_max=max_num_atoms,
    permutation='eigenspectrum',
    seed=42,
)

In [None]:
# Hypothetical timings for SOAPs
# timing=[]
# for label in ['A','B','AB']:
#     mols = list(generate_ASE_mols([v[label] for k,v in use_exists_pathdict.items()]).values())
#     for pct in np.linspace(0.05,0.5,5):
#         t0=time.perf_counter()
#         re.create([normalize(i) for i in soap.create(mols[:int(len(mols)*pct)])])
#         done=time.perf_counter()-t0
#         prints=(label,int(len(mols)*pct),done)
#         print(prints)
#         timing.append(prints)
# timing=np.array(timing)
# for lbl in ['A','B','AB']:
#     t=timing[timing[:,0]==lbl][:,1:].astype(float)
#     x,y=t[:,0],t[:,1]
#     z = np.polyfit(x, y, 3)
#     p = np.poly1d(z)
#     px=range(len(exists_mols)+len(hypo_mols))
#     plt.plot(px,p(px),label=lbl)
#     plt.scatter(x, y)
#     print(lbl,p(px[-1])//60)
# plt.show()        

In [None]:
def generate_ASE_mols(paths):
    '''
    Read xyz files into ASE ase.atoms.Atoms objects, keyed by structure name.

    The key is the file's base name with everything from the last underscore onwards
    removed (e.g. ``name_AB.xyz`` -> ``name``).

    params
    ------
    paths: list
        List of paths to xyz

    returns
    -------
    structs: dict
        Dictionary of ase.atoms.Atoms for each molecule
    '''

    structs={}
    for xyz_path in paths:
        pieces=os.path.basename(xyz_path).split('_')
        key='_'.join(pieces[:-1])
        structs[key]=read(xyz_path)

    return structs

In [None]:
def gen_dscribe_features(mols):
    '''
    Generate DScribe features: Coulomb matrix (CM) and Smooth Overlap of Atomic Positions (SOAPs)

    Uses the module-level descriptor objects `cm`, `soap` and `re`. The time spent in
    each of the two steps is printed separately.

    params
    ------
    mols: list
        List of ASE ase.atoms.Atoms

    returns
    -------
    dscribe_dict: dict
        Dictionary with two entries: 'CM' is the output of `cm.create(mols)`; 'SOAP' is
        the output of `re.create` applied to the normalized per-molecule SOAP arrays
        (i.e. the kernel result, not the raw SOAP vectors).
    '''

    # Create CM representation
    t0=time.perf_counter()
    cm_mol = cm.create(mols)
    CM_time=time.perf_counter()-t0
    print(f"CMs in {CM_time:.2f} (s)")

    # Create SOAP representation.
    # The timer is restarted so the reported SOAP time does not include the CM step.
    t1=time.perf_counter()
    soap_mol = re.create([normalize(i) for i in soap.create(mols)])
    SP_time=time.perf_counter()-t1
    print(f"SOAPs in {SP_time:.2f} (s)")

    dscribe_dict={'CM':cm_mol,'SOAP':soap_mol}
    return dscribe_dict

In [None]:
# Folder that holds the Existing/ and Hypothetical/ geometry directories
_geom_root=f'{os.path.dirname(os.getcwd())}/database/bse49-main/Geometries'

# Generate DScribe features for existing molecules
use_exists_pathdict={}
for name in use_exists:
    use_exists_pathdict[name]={frag:f'{_geom_root}/Existing/{name}_{frag}.xyz' for frag in ('AB','A','B')}

exists_AB_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['AB'] for paths in use_exists_pathdict.values()]).values()])
exists_A_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['A'] for paths in use_exists_pathdict.values()]).values()])
exists_B_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['B'] for paths in use_exists_pathdict.values()]).values()])


# Generate DScribe features for hypothetical molecules
use_hypo_pathdict={}
for name in use_hypo:
    use_hypo_pathdict[name]={frag:f'{_geom_root}/Hypothetical/{name}_{frag}.xyz' for frag in ('AB','A','B')}

hypo_AB_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['AB'] for paths in use_hypo_pathdict.values()]).values()])
hypo_A_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['A'] for paths in use_hypo_pathdict.values()]).values()])
hypo_B_dscribe=gen_dscribe_features([*generate_ASE_mols([paths['B'] for paths in use_hypo_pathdict.values()]).values()])

# Combine existing and hypothetical dscribe data: per representation, existing rows
# (indexed by use_exists) are stacked on top of hypothetical rows (indexed by use_hypo).
# NOTE(review): the 'SOAP' entries are whatever `re.create` returns for each subset on its
# own; if that is a square kernel matrix, the two subsets have different column counts and
# concat will pad with NaN — confirm this is intended.
joined_AB_dscribe={
    rep:pd.concat([pd.DataFrame(exists_AB_dscribe[rep],index=use_exists),pd.DataFrame(hypo_AB_dscribe[rep],index=use_hypo)])
    for rep in exists_AB_dscribe
}
joined_A_dscribe={
    rep:pd.concat([pd.DataFrame(exists_A_dscribe[rep],index=use_exists),pd.DataFrame(hypo_A_dscribe[rep],index=use_hypo)])
    for rep in exists_A_dscribe
}
joined_B_dscribe={
    rep:pd.concat([pd.DataFrame(exists_B_dscribe[rep],index=use_exists),pd.DataFrame(hypo_B_dscribe[rep],index=use_hypo)])
    for rep in exists_B_dscribe
}

In [None]:
def generate_allDScribe():
    '''
    Assemble the 'AB' and 'sub' ((A+B)-AB) feature sets for the CM and SOAP representations.

    Reads the module-level joined_AB_dscribe, joined_A_dscribe and joined_B_dscribe.

    returns
    -------
    data_dict: dict
        Maps 'CM' and 'SOAP' to a dictionary with the keys 'AB' and 'sub'
    '''
    return {
        rep:{
            'AB':joined_AB_dscribe[rep],
            'sub':(joined_A_dscribe[rep]+joined_B_dscribe[rep])-joined_AB_dscribe[rep],
        }
        for rep in ('CM','SOAP')
    }

In [None]:
# Assemble the CM and SOAP feature sets ('AB' and 'sub' variants) from the joined DScribe tables
dscribe_data=generate_allDScribe()

# Persistent Images

In [None]:
# Persistent-image settings
pixelsx = 20 # Don't make too large (150 was the other value noted here)
pixelsy = 20 # 150
spread = 0.06
Max = 2.5 # 2.5

# One row of pixelsx*pixelsy values per complex: existing rows first, then hypothetical.
# assumes VariancePersistv1 returns a flat vector of that length — TODO confirm
X=np.zeros((len(use_exists)+len(use_hypo),(pixelsx*pixelsy)))

overIDX=0  # next free row of X, shared by both sets
for folder,names in (('Existing',use_exists),('Hypothetical',use_hypo)):
    for i in names:
        AB=f"{os.path.dirname(os.getcwd())}/database/bse49-main/Geometries/{folder}/{i}_AB.xyz"
        PI_AB=VariancePersistv1(f'{AB}', pixelx=pixelsx, pixely=pixelsy,myspread=spread, myspecs={"maxBD": Max, "minBD":  -0.1}, showplot=False)
        # Write to the running row index. Restarting at 0 for the hypothetical set would
        # overwrite the existing rows and leave the last len(use_hypo) rows all zero.
        X[overIDX,:]=PI_AB
        overIDX+=1
    # Running total of images written so far
    print(overIDX)

# Row labels follow the fill order above
PI_DF=pd.DataFrame(X,index=use_exists+use_hypo)

# Only the AB images are computed; there is no (A+B)-AB variant for this representation
PI_combined={'AB':PI_DF,'sub':None}

# Save molecular representations

In [None]:
# Collect every representation under its name; each value maps 'AB'/'sub' to a table (or None)
molecular_representations={
    "MACCS":maccs_combined,
    "RDKit":FP_combined,
    "Morgan":Morgan_combined,
    "PI":PI_combined,
}

# The DScribe representations are already keyed by name, so they are merged in directly
molecular_representations.update(dscribe_data)


In [None]:
# Sanity check: print each representation name with the keys of the variants it provides
for k,v in molecular_representations.items():
    print(k,v.keys())

# Save y

In [None]:
# Curate y (target) data
# Both .org tables are read as '|'-delimited text without a header row; column 1 becomes
# the index, columns containing any NaN are dropped, and only columns 7 and 8 are kept.
# NOTE(review): these paths are relative to the working directory ('./bse49-main/...'), while
# the geometries are read from '<parent of cwd>/database/bse49-main/...' — confirm both exist.
BSE49_Existing=pd.read_csv('./bse49-main/BSE49_Existing.org',delimiter='|',header=None,index_col=1).dropna(axis=1)[[7,8]]
BSE49_Hypothetical=pd.read_csv('./bse49-main/BSE49_Hypothetical.org',delimiter='|',header=None,index_col=1).dropna(axis=1)[[7,8]]
BSE_ydf=pd.concat([BSE49_Existing,BSE49_Hypothetical])
# For every row, the column-7 entry (stripped, with '_AB' removed) becomes the row label and
# the column-8 entry becomes the 'BSE' value; rows are then selected in the order use_exists+use_hypo.
# presumably column 7 holds the AB file stem and column 8 the target energy — verify against the .org files
y_df=pd.DataFrame.from_dict({list(v.values())[0].strip().replace('_AB',''):list(v.values())[1] for v in BSE_ydf.to_dict(orient='index').values()},orient='index').rename(columns={0:'BSE'}).loc[use_exists+use_hypo]

# Save all data

In [None]:
# Bundle the features ('X': representation name -> variant tables) and the targets ('y') for saving
savedata={'X':molecular_representations,'y':y_df}

In [None]:
# Write every available feature table to '<representation>_<variant>.csv.gz'
for k,v in savedata['X'].items():
    print(k)
    for k1,v1 in v.items():
        if v1 is None:
            # Variant not available for this representation; nothing to write
            continue
        v1.to_csv(f'{k}_{k1}.csv.gz', compression='gzip')


In [None]:
# Write the target table next to the feature files
target_table=savedata['y']
target_table.to_csv('y.csv.gz', compression='gzip')

In [None]:
# with open('currated_reps.bin','wb') as f:
#     joblib.dump(savedata,f)

In [None]:
def check_sparsity(A):
    '''
    Check the sparsity of a given feature matrix

    Sparsity is the fraction of entries equal to zero: 1 - (non-zero count / total count).

    params
    ------
    A: array-like
        numpy.ndarray or anything numpy.asarray accepts (e.g. a nested list or a DataFrame)

    returns
    -------
    sparse: float
        Value between 0 (no zeros) and 1 (all zeros)

    raises
    ------
    ZeroDivisionError
        If A has no elements.

    '''

    # Convert first so inputs without a `.size` attribute (plain lists) also work
    values=np.asarray(A)
    sparse=1-(np.count_nonzero(values)/values.size)
    return sparse