These are MoleculeNet benchmarks meant to sanity-check our approaches.

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def plot_confusion_matrix(y_pred, y_true, ax):
    """Draw a binary confusion matrix for labels 0 and 1 on ``ax``.

    Args:
        y_pred: Predicted class labels.
        y_true: Ground-truth class labels.
        ax: Axes object handed to ``ConfusionMatrixDisplay.plot``.
    """
    class_labels = [0, 1]

    # normalize="true" normalizes the counts over the true-label rows.
    matrix = confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        labels=class_labels,
        normalize="true",
    )

    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_labels,
    ).plot(cmap="Purples", ax=ax)
    
    
    
from sklearn.metrics import RocCurveDisplay, roc_curve, auc

def plot_roc_curve(y_score, y_true, ax, name):
    """Plot one ROC curve, annotated with its AUC, on ``ax``.

    Args:
        y_score: Scores passed to ``roc_curve`` as ``y_score``.
        y_true: Ground-truth labels passed to ``roc_curve`` as ``y_true``.
        ax: Axes object handed to ``RocCurveDisplay.plot``.
        name: Label handed to ``RocCurveDisplay.plot`` for this curve.
    """
    # The thresholds returned by roc_curve are not needed for plotting.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true=y_true, y_score=y_score)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    curve = RocCurveDisplay(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        roc_auc=area_under_curve,
    )
    curve.plot(ax=ax, name=name)
    
    
    
def plot_dist_plot(y_score, y_true, ax):
    """Overlay the score distributions of the two classes on ``ax``.

    Scores whose true label is 0 are drawn in blue, scores whose true label
    is 1 in red.

    Args:
        y_score: Scores; indexed with a boolean mask, so presumably a NumPy
            array (or similar) -- confirm against the callers.
        y_true: Ground-truth labels, compared element-wise against 0 and 1.
        ax: Axes object handed to ``sns.distplot``.
    """
    # Build both per-class selections before any drawing happens.
    negatives, positives = [y_score[y_true == label] for label in (0, 1)]

    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
    # migrating to histplot/kdeplot once the pinned seaborn version is known.
    for scores, colour in ((negatives, "blue"), (positives, "red")):
        sns.distplot(scores, color=colour, ax=ax)
    

In [None]:
def bbbp_main(path_list=None, out_path="bbbp_result_roc.png"):
    """Plot the ROC curves of all BBBP models on one figure and save it.

    Each result file is a pickle whose first element is a mapping with at
    least the keys ``"y_true"`` and ``"eta"`` (the scores). The model name
    is the second-to-last dot-separated component of the file path, e.g.
    ``bbbp_result.ecfp.pkl`` -> ``"ecfp"``.

    Args:
        path_list: Result pickle paths to plot. Defaults to the four BBBP
            result files (smiles-pe, ecfp, rp, rf) on the original
            author's workspace.
        out_path: Destination of the saved figure.
    """
    if path_list is None:
        path_list = [
            "/g/g13/jones289/workspace/hd-cuda-master/hdpy/hdpy/results/bbbp_result.smiles-pe.pkl",
            "/g/g13/jones289/workspace/hd-cuda-master/hdpy/hdpy/results/bbbp_result.ecfp.pkl",
            "/g/g13/jones289/workspace/hd-cuda-master/hdpy/hdpy/results/bbbp_result.rp.pkl",
            "/g/g13/jones289/workspace/hd-cuda-master/hdpy/hdpy/results/bbbp_result.rf.pkl",
        ]

    roc_f, roc_ax = plt.subplots(1, 1, figsize=(10, 6))

    for path in path_list:
        # The model name doubles as the curve label in the legend.
        name = str(path).split(".")[-2]

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at result files produced by trusted runs.
        with open(path, "rb") as handle:
            data = pickle.load(handle)

        # The file is closed at this point; only in-memory data is used below.
        y_true = data[0]["y_true"]
        y_score = data[0]["eta"]

        if name == "rf":
            # The rf scores are 2-D; column 1 is used as the score
            # (presumably the positive-class probability -- confirm).
            y_score = y_score[:, 1]

        # Per-model confusion-matrix / score-distribution figures can be
        # drawn here with plot_confusion_matrix and plot_dist_plot.
        plot_roc_curve(y_score=y_score, y_true=y_true, ax=roc_ax, name=name)

    roc_ax.plot([0, 1], [0, 1], "k--")  # random predictions curve

    roc_f.suptitle("Blood-Brain-Barrier Permeability Receiver Operating Curve")
    # Save through the figure object so the output never depends on which
    # figure happens to be matplotlib's "current" one.
    roc_f.savefig(out_path, dpi=600, bbox_inches="tight")


In [None]:
# Render the BBBP ROC comparison and write it to "bbbp_result_roc.png".
bbbp_main()

In [None]:
def sider_main():
    """Box-plot the per-task ROC-AUC of each model on SIDER and save it.

    For every model in (smiles-pe, ecfp, rp) the files matching
    ``sider_*.<model>.pkl`` under the module-level ``data_p`` directory are
    loaded. Each file is a pickle whose first element is a mapping with at
    least the keys ``"y_true"`` and ``"eta"`` (the scores). One ROC-AUC
    value is recorded per file and the distribution per model is drawn as a
    box plot saved to ``sider_result_roc.png``.

    Raises:
        ValueError: If a matched file name does not end its first
            dot-separated component with a task number.
    """
    from pathlib import PurePath

    model_dict = {"model": [], "task": [], "roc-auc": []}

    for model in ("smiles-pe", "ecfp", "rp"):
        for path in data_p.glob(f"sider_*.{model}.pkl"):
            # The task id is the run of trailing digits in the file-name
            # part before the first ".". Using the file name (not the full
            # path) keeps dots in directory names from interfering, and
            # taking all trailing digits keeps multi-digit ids intact.
            stem = PurePath(path).name.split(".")[0]
            task = int(stem[len(stem.rstrip("0123456789")):])

            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at result files produced by trusted runs.
            with open(path, "rb") as handle:
                data = pickle.load(handle)

            y_true = data[0]["y_true"]
            y_score = data[0]["eta"]

            fpr, tpr, _ = roc_curve(y_score=y_score, y_true=y_true)

            model_dict["model"].append(model)
            model_dict["task"].append(task)
            model_dict["roc-auc"].append(auc(fpr, tpr))

    df = pd.DataFrame(model_dict)
    g = sns.catplot(
        data=df,
        x="model",
        y="roc-auc",
        kind="box",
        palette="deep",
        height=6,
        aspect=1.25,
    )

    g.fig.suptitle("SIDER ROC-AUC Distribution Over Tasks")
    # Save through the grid's own figure rather than the implicit current one.
    g.fig.savefig("sider_result_roc.png", dpi=600, bbox_inches="tight")

In [None]:
# Render the SIDER ROC-AUC box plot and write it to "sider_result_roc.png".
sider_main()