In [1]:
#imports
import time
import qcportal as ptl
import numpy as np
import matplotlib.pyplot as plt
from fragmenter.utils import HARTREE_2_KJMOL
from fragmenter import chemi
from simtk import unit
import os, sys
from fragmenter import chemi
import cmiles
from openeye import oechem
import qcportal as ptl
from openforcefield.topology import Molecule, Topology
from openforcefield.typing.engines.smirnoff import ForceField
from scipy import stats
import arch.bootstrap
import fragmenter 



# of points selected =  4


In [2]:
# Names of the TorsionDrive datasets analysed in this notebook; each name is
# later turned into an '<name>.oeb' file name.
TD_datasets = [
    "Fragment Stability Benchmark",
    "Fragmenter paper",
    "OpenFF DANCE 1 eMolecules t142 v1.0",
    "OpenFF Fragmenter Validation 1.0",
    "OpenFF Full TorsionDrive Benchmark 1",
    "OpenFF Gen 2 Torsion Set 1 Roche",
    "OpenFF Gen 2 Torsion Set 1 Roche 2",
    "OpenFF Gen 2 Torsion Set 2 Coverage",
    "OpenFF Gen 2 Torsion Set 2 Coverage 2",
    "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 5 Bayer",
    "OpenFF Gen 2 Torsion Set 5 Bayer 2",
    "OpenFF Gen 2 Torsion Set 6 Supplemental",
    "OpenFF Gen 2 Torsion Set 6 Supplemental 2",
    "OpenFF Group1 Torsions",
    "OpenFF Group1 Torsions 2",
    "OpenFF Group1 Torsions 3",
    "OpenFF Primary Benchmark 1 Torsion Set",
    "OpenFF Primary Benchmark 2 Torsion Set",
    "OpenFF Primary TorsionDrive Benchmark 1",
    "OpenFF Rowley Biaryl v1.0",
    "OpenFF Substituted Phenyl Set 1",
    "OpenFF-benchmark-ligand-fragments-v1.0",
    "Pfizer Discrepancy Torsion Dataset 1",
    "SMIRNOFF Coverage Torsion Set 1",
    "SiliconTX Torsion Benchmark Set 1",
    "TorsionDrive Paper",
]

In [3]:
def oeb2oemol(oebfile):
    """
    Read every molecule stored in an .oeb file into a list.

        Parameters
        ----------
        oebfile : String
            Name (path) of the oeb file to read
        Returns
        -------
        mollist : List of objects
            One oechem.OEGraphMol per molecule yielded by the stream, in
            file order

    """
    ifs = oechem.oemolistream(oebfile)
    try:
        # Each yielded molecule is copied into a fresh OEGraphMol so that the
        # list entries are independent objects (presumably the generator
        # reuses its molecule between iterations -- confirm against OEChem).
        return [oechem.OEGraphMol(mol) for mol in ifs.GetOEGraphMols()]
    finally:
        # Release the file handle even if reading fails part-way through.
        ifs.close()


def compute_r_ci(wbos, max_energies):
    """
    Return the squared correlation coefficient (r^2) of a linear fit.

    Parameters
    ----------
    wbos : array-like
        x values handed to scipy.stats.linregress
    max_energies : array-like
        y values handed to scipy.stats.linregress

    Returns
    -------
    float
        Square of the r-value reported by the regression
    """
    fit = stats.linregress(wbos, max_energies)
    return fit.rvalue ** 2


def plot_interactive(fileList, t_id=None):
    """
    Plot Wiberg bond order against torsion barrier for the molecules stored
    in a list of oeb files, one marker series plus one linear fit per
    torsion id, and serve the plot in an inline Dash app.

    Box-selecting points in the plot renders the matching chemical
    structures below it.

    Note: ***Plot is interactive (or returns chemical structures) only for the last usage
    (presumably because every call serves its app on the same port, 8070).

    Parameters
    ----------
    fileList: list of strings
    each string is a oeb file name
    Eg. ['rowley.oeb'] or ['rowley.oeb', 'phenyl.oeb']

    t_id: str
    torsion id, eg., 't43'; when given, only molecules with this id are kept

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The figure holding the scatter series and the fitted lines
    """
    import plotly.express as px
    from jupyter_dash import JupyterDash
    import dash_core_components as dcc
    import dash_html_components as html
    import pandas as pd
    import plotly.graph_objects as go
    from dash.dependencies import Input, Output
    from rdkit import Chem
    from rdkit.Chem.Draw import MolsToGridImage
    import base64
    from io import BytesIO

    molList = []
    for fileName in fileList:
        molList.extend(oeb2oemol(fileName))

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copies the frame on
    # every iteration.
    records = [{'tid': m.GetData("IDMatch"),
                'tb': m.GetData("TB"),
                'wbo': m.GetData("WBO"),
                'cmiles': m.GetData("cmiles"),
                'TDindices': m.GetData("TDindices")}
               for m in molList]
    df = pd.DataFrame(records,
                      columns=['tid', 'tb', 'wbo', 'cmiles', 'TDindices'])

    if t_id:
        print(t_id, "only")
        df = df[df.tid == t_id]

    colors = fragmenter.chemi._KELLYS_COLORS

    fig = go.Figure({'layout' : go.Layout(height=400,
            xaxis={'title': 'Wiberg Bond Order'},
            yaxis={'title': 'Torsion barrier in KJ/mol'},
            margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
            legend={'x': 1, 'y': 1},
            hovermode=False,
            dragmode='select')})

    for count, tid in enumerate(df.tid.unique()):
        x = df[df.tid == tid].wbo
        y = df.loc[x.index].tb
        # Wrap around the palette so that having more torsion ids than
        # colors does not raise IndexError.
        color = colors[count % len(colors)]
        fig.add_scatter(x=x,
                        y=y,
                        mode="markers",
                        name=tid,
                        marker_color=color)

        # A line cannot be fitted through fewer than two distinct x values;
        # keep the markers and skip the regression for such groups.
        if x.nunique() < 2:
            print("tid: ", tid,
                  "skipping fit: fewer than two distinct WBO values")
            continue

        slope, intercept, r_value, _p_value, _std_err = stats.linregress(x, y)
        print("tid: ", tid, "r_value: ", r_value,
              "slope: ", slope, "intercept: ", intercept)
        fig.add_traces(go.Scatter(
            x=np.unique(x),
            y=np.poly1d([slope, intercept])(np.unique(x)),
            showlegend=False, mode ='lines', marker_color=color))

    graph_component = dcc.Graph(id="graph_id", figure=fig)
    image_component = html.Img(id="structure-image")

    app = JupyterDash(__name__)

    app.layout = html.Div([
        html.Div([graph_component]),
        html.Div([image_component])])

    @app.callback(
        Output('structure-image', 'src'),
        [Input('graph_id', 'selectedData')])
    def display_selected_data(selectedData):
        """Return an image data URI showing the structures of the selection."""
        max_structs = 12
        structs_per_row = 1
        # 1x1 GIF used when there is nothing to draw.
        empty_plot = "data:image/gif;base64,R0lGODlhAQABAAAAACwAAAAAAQABAAA="
        if not selectedData or len(selectedData['points']) == 0:
            return empty_plot
        print("# of points selected = ", len(selectedData['points']))

        # Rows are matched to the selection by their x (WBO) value. The
        # structures and their legends are both derived from these same
        # rows, capped at max_structs, so the two lists always line up even
        # when the number of matching rows differs from the number of
        # selected points.
        xval = [pt['x'] for pt in selectedData['points']]
        match_df = df[df['wbo'].isin(xval)].iloc[:max_structs]
        if match_df.empty:
            return empty_plot

        name_list = []
        for row in match_df.itertuples(index=False):
            # The stored torsion indices are shifted by one for display.
            tmp = [str(row.tid), ':', 'TDindices [',
                   str(row.TDindices[0]+1), str(row.TDindices[1]+1),
                   str(row.TDindices[2]+1), str(row.TDindices[3]+1), ']',
                   'wbo:', str('%.2f'%(row.wbo)),
                   'TB:', str('%.2f'%(row.tb)), 'KJ/mol]']
            name_list.append(' '.join(tmp))

        mol_list = [Chem.MolFromSmiles(smi) for smi in match_df.cmiles]
        img = MolsToGridImage(mol_list,
                              subImgSize=(400, 400),
                              molsPerRow=structs_per_row,
                              legends=name_list)
        buffered = BytesIO()
        # NOTE(review): legendFontSize is handed to the image's save() call,
        # not to MolsToGridImage -- confirm whether it has any effect here.
        img.save(buffered, format="PNG", legendFontSize=40)
        encoded_image = base64.b64encode(buffered.getvalue())
        return 'data:image/png;base64,{}'.format(encoded_image.decode())

    if __name__ == '__main__':
        app.run_server(mode='inline', port=8070, debug=True)

    return fig


In [4]:
# One '<dataset name>.oeb' file name per dataset listed in TD_datasets.
TD_working_oeb = [f"{dataset_name}.oeb" for dataset_name in TD_datasets]

# Combine all datasets and plot only the molecules matched to torsion id t43.
all_t43 = plot_interactive(TD_working_oeb, t_id='t43')

t43 only
tid:  t43 r_value:  0.82272786065585 slope:  516.1768094538851 intercept:  -493.1300638906191
