In [1]:
import os, sys, itertools, functools
import warnings
from collections import ChainMap
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.externals import joblib
from scipy import stats
import matplotlib
matplotlib.use("pdf")
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
from MLA import calc_metrics, q2_f3_score
from prep import filter_nd
%matplotlib inline


In [2]:
# Directory containing the legacy RASPD prediction files, resolved to an
# absolute path relative to the notebook's working directory.
data_path = Path(
    '../../raspd_ml_resources/datasources/RASPD_OLD/'
).resolve()


In [3]:
# Prediction file (path relative to `data_path`) -> display label used as the
# dataset name everywhere else in the notebook.  Insertion order is kept as-is
# because it determines the iteration order of everything built from it.
all_dataset_names = {
    'wang/BACE_RASPD_old.txt': 'BACE',
    'D3R/CDK2-CyclinA_RASPD_old.txt': 'CDK2-Cyclin A',
    'D3R/CDK2-Kinase_RASPD_old.txt': 'CDK2 Kinase',
    'wang/CDK2_RASPD_old.txt': 'CDK2',
    'D3R/CHK1-Kinase_RASPD_old.txt': 'CHK1 Kinase',
    'D3R/ERK2_RASPD_old.txt': 'ERK2',
    'D3R/HSP90_RASPD_old.txt': 'HSP90',
    'D3R/LpxC_RASPD_old.txt': 'LpxC',
    'wang/MCL_All_RASPD_old.txt': 'Mcl-1 all',
    'wang/P38_RASPD_old.txt': 'p38',
    'wang/PTP1B_RASPD_old.txt': 'PTP1B',
    'HiQ/Hiq_Set1_RASPD_Old.txt': 'CSAR HiQ 1',
    'HiQ/Hiq_Set2_RASPD_Old.txt': 'CSAR HiQ 2',
    'D3R/SYK_RASPD_old.txt': 'SYK',
    'wang/TYK2_All_RASPD_old.txt': 'TYK2 all',
    'wang/Thrombin_RASPD_old.txt': 'Thrombin',
    'D3R/TrmD_RASPD_old.txt': 'TrmD',
    'D3R/Urokinase_RASPD_old.txt': 'Urokinase',
}

# Named groups of dataset labels (values of `all_dataset_names`) that are
# pooled together for group-level metrics.  "D3R" lists the members of
# "CSAR12" followed by those of "CSAR14".
dataset_sets = {
    "CSAR12": (
        "CDK2-Cyclin A",
        "CDK2 Kinase",
        "CHK1 Kinase",
        "ERK2",
        "LpxC",
        "Urokinase",
    ),
    "CSAR14": (
        "SYK",
        "TrmD",
        "HSP90",
    ),
    "CSAR HiQ": (
        "CSAR HiQ 1",
        "CSAR HiQ 2",
    ),
    "Wang": (
        "BACE",
        "CDK2",
        "Mcl-1 all",
        "p38",
        "PTP1B",
        "Thrombin",
        "TYK2 all",
    ),
    "D3R": (
        "CDK2-Cyclin A",
        "CDK2 Kinase",
        "CHK1 Kinase",
        "ERK2",
        "LpxC",
        "Urokinase",
        "SYK",
        "TrmD",
        "HSP90",
    ),
}

In [4]:
# Load every prediction file into a frame keyed by its display label.  The
# files are read as header-less, tab-separated tables whose two columns are
# named 'predicted' and 'true'.
data = {
    label: pd.read_csv(
        data_path / rel_file,
        sep='\t',
        header=None,
        names=['predicted', 'true'],
    )
    for rel_file, label in all_dataset_names.items()
}

In [5]:
def make_metrics_df(in_data):
    """Build a table of regression metrics with one row per dataset.

    Parameters
    ----------
    in_data : mapping of str -> pandas.DataFrame
        Each frame must expose ``true`` and ``predicted`` columns; they are
        passed to ``calc_metrics`` in that order.

    Returns
    -------
    pandas.DataFrame
        Indexed by the keys of ``in_data``, with the columns 'mse', 'mae',
        'rho', 'r', 'r2' and a derived 'rmse' (square root of 'mse').
        An empty ``in_data`` yields an empty frame carrying those columns.

    Notes
    -----
    Assumes ``calc_metrics`` returns its values in the order
    (mse, mae, rho, r, r2) -- TODO confirm against ``MLA.calc_metrics``.
    ``zip`` silently truncates if the number of returned values differs.
    """
    metric_names = ('mse', 'mae', 'rho', 'r', 'r2')

    # Deliberately not called ``metrics``: that name is the ``sklearn.metrics``
    # module imported at the top of the notebook and would be shadowed here.
    per_dataset = {
        name: dict(zip(metric_names, calc_metrics(dat.true, dat.predicted)))
        for name, dat in in_data.items()
    }

    if not per_dataset:
        # With no datasets there is no 'mse' column to derive 'rmse' from;
        # return an empty, correctly labelled table instead of a KeyError.
        return pd.DataFrame(columns=list(metric_names) + ['rmse'], dtype=float)

    # DataFrame(dict-of-dicts) places datasets in columns; transpose so each
    # dataset becomes a row.
    metrics_df = pd.DataFrame(per_dataset).T
    metrics_df['rmse'] = np.sqrt(metrics_df['mse'])
    return metrics_df


In [6]:
# Display the metrics table computed from the per-dataset frames in `data`.
make_metrics_df(data)

Unnamed: 0,mae,mse,r,r2,rho,rmse
BACE,1.22,3.137489,-0.139001,-3.98839,-0.245684,1.771296
CDK2-Cyclin A,0.73,2.515922,0.454997,-1.337796,0.393378,1.586166
CDK2 Kinase,2.51,7.580236,0.497908,-5.107739,0.593726,2.753223
CDK2,2.115,6.236369,0.107204,-3.503688,0.166299,2.497272
CHK1 Kinase,1.815,5.210156,0.119681,-1.358963,0.086169,2.282577
ERK2,1.015,2.665288,0.196353,-0.370321,0.250252,1.632571
HSP90,1.325,4.309514,0.209113,-0.488595,0.167911,2.075937
LpxC,1.33,7.166435,0.314027,-1.53505,0.23317,2.67702
Mcl-1 all,1.0,2.496294,0.708926,-0.075448,0.667315,1.579966
p38,0.915,1.186247,0.485919,-0.202501,0.392199,1.08915


In [7]:
# Pool the member datasets of each named group into a single frame by
# stacking their rows (original row indices are kept as they are).
grouped_dict = {
    group: pd.concat([data[member] for member in members])
    for group, members in dataset_sets.items()
}

In [8]:
# Display the metrics table computed from the pooled frames in `grouped_dict`.
make_metrics_df(grouped_dict)

Unnamed: 0,mae,mse,r,r2,rho,rmse
CSAR12,1.25,3.711207,0.289298,-0.598864,0.27022,1.926449
CSAR14,1.48,4.191742,0.316512,-0.323728,0.226306,2.047374
CSAR HiQ,1.65,9.949164,0.592027,0.212884,0.582922,3.154229
Wang,1.37,3.258884,0.54552,0.008089,0.492447,1.805238
D3R,1.38,4.005151,0.30161,-0.410698,0.221208,2.001287
