In [None]:
import prodec
from pcm_dataset import AR_PCM_both
from qsprpred.data.utils.datasplitters import randomsplit, temporalsplit, scaffoldsplit
from qsprpred.extra.data.utils.datasplitters import LeaveTargetsOut, StratifiedPerTarget, TemporalPerTarget
from qsprpred.data.utils.featurefilters import highCorrelationFilter, lowVarianceFilter
from qsprpred.data.utils.descriptorcalculator import MoleculeDescriptorsCalculator
from qsprpred.extra.data.utils.descriptorcalculator import ProteinDescriptorCalculator
from qsprpred.data.utils.descriptorsets import FingerprintSet
from qsprpred.extra.data.utils.descriptorsets import ProDecDescriptorSet
from qsprpred.extra.data.utils.descriptor_utils.msa_calculator import ClustalMSA
from qsprpred.extra.data.data import PCMDataset
from qsprpred.models.tasks import TargetTasks

from sklearn.ensemble import RandomForestClassifier
from qsprpred.extra.models.pcm import QSPRsklearnPCM
from qsprpred.plotting.classification import ROCPlot, MetricsPlot
from qsprpred.models.interfaces import QSPRModel
from qsprpred.models.hyperparam_optimization import OptunaOptimization


## Fingerprints and descriptors

In [None]:
from qsprpred.data.utils.descriptorsets import rdkit_descs


def _protein_calculator(prodec_sets):
    """Build a ProteinDescriptorCalculator for the given ProDec descriptor-set names.

    Every call creates its own ProDecDescriptorSet and its own ClustalMSA
    provider configured with out_dir="data".
    """
    return ProteinDescriptorCalculator(
        descsets=[ProDecDescriptorSet(sets=prodec_sets)],
        msa_provider=ClustalMSA(out_dir="data"),
    )


def _morgan_fingerprint():
    """Return a new Morgan fingerprint set with radius 3 and 2048 bits."""
    return FingerprintSet(fingerprint_type="MorganFP", radius=3, nBits=2048)


# --- Protein descriptor calculators -----------------------------------------
# Hellberg Z-scales.
calc_prot_Zscale = _protein_calculator(["Zscale Hellberg"])

# Z-scale (3 PCs, van Westen) combined with the hashed ProtFP feature set.
calc_prot_ZscalePC3_protFPfeature = _protein_calculator(["Zscale van Westen", 'ProtFP hash'])

# MS-WHIM
calc_prot_MS_WHIM = _protein_calculator(["MS-WHIM"])

# Protein descriptor sets that were tried earlier and are currently disabled:
#   - ["Zscale binary"]                 (zscale binned)
#   - ["Zscale binary", 'ProtFP PCA']   (zscale binned with protfp pca)
#   - ["Zscale Sandberg"]               (zscale pc 5)

# --- Molecule descriptor calculators ----------------------------------------
# Each calculator receives its own FingerprintSet / rdkit_descs instance.
calc_mol_MFP = MoleculeDescriptorsCalculator(descsets=[_morgan_fingerprint()])
calc_mol_Rdkit = MoleculeDescriptorsCalculator(descsets=[rdkit_descs()])
calc_mol_MFP_Rdkit = MoleculeDescriptorsCalculator(descsets=[_morgan_fingerprint(), rdkit_descs()])

# Molecular fingerprints that were tried earlier and are currently disabled
# (FingerprintSet fingerprint_type values): "CDKFP", "CDKExtendedFP",
# "CDKEStateFP", "CDKGraphOnlyFP", "CDKMACCSFP", "CDKPubchemFP",
# "CDKSubstructureFP", "CDKKlekotaRothFP", "CDKAtomPairs2DFP".
# Disabled molecular descriptor: PaDEL_calc(['nAtom']) (atom counts).



# Data prep

## activity == ["Ki", "IC50"]

In [None]:
# Build the PCM dataset restricted to Ki and IC50 activity types.
ds = AR_PCM_both(["Ki", "IC50"], "AR_LIGANDS_Ki_IC50")

# Print a short overview of the dataset, one "label value" line per item.
for label, value in (
    ("protein column: ", ds.proteincol),
    ("protein sequence provider: ", ds.proteinseqprovider),
    ("Size: ", len(ds)),
    ("Target properties: ", ds.targetProperties),
):
    print(label, value)

# NOTE(review): nJobs presumably sets the number of parallel workers — confirm.
ds.nJobs = 4

# Last expression of the cell: display the underlying data frame.
ds.getDF()

In [None]:
# Random 0.2 split wrapped in StratifiedPerTarget; no explicit seed is passed here.
ki_ic50_split = StratifiedPerTarget(splitter=randomsplit(0.2))

# Features: Hellberg Z-scale protein descriptors plus Morgan fingerprints.
ds.prepareDataset(split=ki_ic50_split, feature_calculators=[calc_prot_Zscale, calc_mol_MFP])

In [None]:
# Training-set feature matrix (element 0 of what ds.getFeatures() returns).
ds.getFeatures()[0]

In [None]:
# Test-set feature matrix (element 1 of what ds.getFeatures() returns).
ds.getFeatures()[1]

## activity == ["Ki", "IC50", "EC50", "Kd"]

In [None]:
# Build the PCM dataset that also includes EC50 and Kd activity types.
ds2 = AR_PCM_both(["Ki", "IC50", "EC50", "Kd"], "AR_LIGANDS_Ki_IC50_EC50_Kd")

# Print a short overview of the dataset, one "label value" line per item.
for label, value in (
    ("protein column: ", ds2.proteincol),
    ("protein sequence provider: ", ds2.proteinseqprovider),
    ("Size: ", len(ds2)),
    ("Target properties: ", ds2.targetProperties),
):
    print(label, value)

# NOTE(review): nJobs presumably sets the number of parallel workers — confirm.
ds2.nJobs = 4

# Last expression of the cell: display the underlying data frame.
ds2.getDF()

In [None]:
# Random 0.2 split wrapped in StratifiedPerTarget; no explicit seed is passed here.
all_activities_split = StratifiedPerTarget(splitter=randomsplit(0.2))

# Features: Hellberg Z-scale protein descriptors plus Morgan fingerprints.
ds2.prepareDataset(split=all_activities_split, feature_calculators=[calc_prot_Zscale, calc_mol_MFP])

In [None]:
# Training-set feature matrix (element 0 of what ds2.getFeatures() returns).
ds2.getFeatures()[0]

In [None]:
# Test-set feature matrix (element 1 of what ds2.getFeatures() returns).
ds2.getFeatures()[1]

In [None]:
# Persist both activity-comparison datasets, Ki/IC50 first.
for dataset in (ds, ds2):
    dataset.save()

In [None]:
# Show the dataset name together with its storage directory. Written as two
# separate bare expressions, only the last one (storeDir) is displayed by the
# notebook, so both are returned as one tuple.
ds.name, ds.storeDir

## Different descriptors and fingerprints

In [None]:
# rd kit descriptors  => pharmacore


def _split4_dataset(name):
    """Create an AR_PCM_both dataset for Ki/IC50 with data_dir='data/LeaveOutSplit_4'.

    A fresh activity-type list is passed on every call.
    """
    return AR_PCM_both(["Ki", "IC50"], name, data_dir='data/LeaveOutSplit_4')


# Z-scale (3 PCs) + hashed ProtFP feature, one dataset per molecule descriptor.
ds_ZPC3_protFPfeature_MorganFP = _split4_dataset("AR_LIGANDS_ZPC3_protFPfeature_MorganFP")
ds_ZPC3_protFPfeature_Rdkit = _split4_dataset("AR_LIGANDS_ZPC3_protFPfeature_Rdkit")
ds_ZPC3_protFPfeature_MorganFP_Rdkit = _split4_dataset("AR_LIGANDS_ZPC3_protFPfeature_MorganFP_Rdkit")

# MS-WHIM, one dataset per molecule descriptor.
ds_MS_WHIM_MorganFP = _split4_dataset("AR_LIGANDS_MS_WHIM_MorganFP")
ds_MS_WHIM_Rdkit = _split4_dataset("AR_LIGANDS_MS_WHIM_Rdkit")
ds_MS_WHIM_MorganFP_Rdkit = _split4_dataset("AR_LIGANDS_MS_WHIM_MorganFP_Rdkit")

# Not needed anymore (datasets that used to be built here with
# AR_PCM_both(["Ki", "IC50"], <name>) and the default data_dir):
#   zscale binned:                  AR_LIGANDS_ZBinned_{MorganFP,Rdkit,MorganFP_Rdkit}
#   zscale binned with protfp pca:  AR_LIGANDS_ZBinned_protFPpca_{MorganFP,Rdkit,MorganFP_Rdkit}
#   zscale pc 5:                    AR_LIGANDS_ZPC5_{MorganFP,Rdkit,MorganFP_Rdkit}
#   CDK fingerprints:               AR_LIGANDS_CDK, AR_LIGANDS_CDKExtendedFP, AR_LIGANDS_CDKEStateFP,
#                                   AR_LIGANDS_CDKGraphOnlyFP, AR_LIGANDS_CDKMACCSFP,
#                                   AR_LIGANDS_CDKPubchemFP, AR_LIGANDS_CDKSubstructureFP,
#                                   AR_LIGANDS_CDKAtomPairs2DFP
#   AR_LIGANDS_CDKKlekotaRothFP and AR_LIGANDS_Counts: doesnt work
# A former sanity check compared ds_CDK == ds_Counts (should return false) and
# printed ds_CDK.proteinseqprovider.

In [None]:
# All six descriptor-comparison datasets, saved once right after creation.
DS_LIST = [ds_ZPC3_protFPfeature_MorganFP, ds_ZPC3_protFPfeature_Rdkit, ds_ZPC3_protFPfeature_MorganFP_Rdkit, ds_MS_WHIM_MorganFP, ds_MS_WHIM_Rdkit, ds_MS_WHIM_MorganFP_Rdkit]

# The loop variable is deliberately not called `ds`: that name is the
# notebook-level Ki/IC50 dataset and must not be rebound by a loop.
for dataset in DS_LIST:
    dataset.save()

In [None]:
# List the column labels of the data frame behind the ZPC3 + MorganFP dataset.
ds_ZPC3_protFPfeature_MorganFP.getDF().columns

In [None]:
# Rows for accession P29274, ordered by Year, restricted to those two columns.
(
    ds_ZPC3_protFPfeature_MorganFP.getDF()[["accession", "Year"]]
    .query('accession == "P29274"')
    .sort_values('Year')
)

In [None]:
# Datasets grouped by protein descriptor: row i is prepared with PROT_FP_LIST[i],
# and within a row, position j is prepared with CALC_FP_LIST[j].
DS_FP_LIST = [[ds_ZPC3_protFPfeature_MorganFP, ds_ZPC3_protFPfeature_Rdkit, ds_ZPC3_protFPfeature_MorganFP_Rdkit],
             [ds_MS_WHIM_MorganFP, ds_MS_WHIM_Rdkit, ds_MS_WHIM_MorganFP_Rdkit]]

CALC_FP_LIST = [calc_mol_MFP, calc_mol_Rdkit, calc_mol_MFP_Rdkit]

PROT_FP_LIST = [calc_prot_ZscalePC3_protFPfeature,
                calc_prot_MS_WHIM]

# Year handed to TemporalPerTarget for each protein accession.
# NOTE(review): whether the listed year itself falls into the training or the
# test set is decided inside TemporalPerTarget — confirm against its docs.
TEMPORAL_SPLIT_YEARS = {"P29274": 2014, "P29275": 2014, "P30542": 2015, "P0DMS8": 2016,
                        "P25099": 2012, "P30543": 2012, "P29276": 2011, "P28647": 2012}

# Other splits that were run with this same loop (swap the `split=` argument;
# receptor labels are the ones given in the original notes):
#   - leave target out split (left out the A2A and A2B, both in human and in rat):
#       LeaveTargetsOut(["P29275", "P30542", "P30543", "P29276"], dataset)
#   - leave target out split (left out the A1, both in human and in rat):
#       LeaveTargetsOut(["P29274", "P25099"], dataset)
#   - leave target out split (left out the A3, only in human):
#       LeaveTargetsOut(["P0DMS8"], dataset)
#   - random split:
#       StratifiedPerTarget(splitter=randomsplit(0.2))

# temporal per target split
# The loop variables are deliberately not called `ds`, so the notebook-level
# `ds` dataset is not rebound as a side effect of running this cell.
for prot_calc, prot_datasets in zip(PROT_FP_LIST, DS_FP_LIST):
    for dataset, mol_calc in zip(prot_datasets, CALC_FP_LIST):
        dataset.nJobs = 4
        dataset.prepareDataset(
            # A fresh dict per dataset, matching the original per-call literal.
            split=TemporalPerTarget("Year", dict(TEMPORAL_SPLIT_YEARS)),
            feature_calculators=[prot_calc, mol_calc],
            feature_fill_value=0,
            recalculate_features=True,
            feature_filters=[highCorrelationFilter(0.9), lowVarianceFilter(0.0)],
        )

        dataset.save()

In [None]:
# Element 0 of getFeatures() for the ZPC3 + MorganFP dataset (the earlier cells
# label element 0 as the training-set feature matrix).
ds_ZPC3_protFPfeature_MorganFP.getFeatures()[0]

# Loading in the datasets

In [None]:
# Target-property specification shared by every PCMDataset loaded below:
# a single-class task on the median pChEMBL value with threshold list [6.5].
# NOTE(review): presumably values are binarised at 6.5 — confirm against QSPRpred.
pchembl_median_task = {
    "name": "pchembl_value_Median",
    "task": TargetTasks.SINGLECLASS,
    "th": [6.5],
}
tp = [pchembl_median_task]

In [None]:
# Keyword arguments shared by both activity-comparison datasets stored in "data".
activity_store_kwargs = dict(store_dir="data", proteincol="accession", target_props=tp)

ds = PCMDataset(name="AR_LIGANDS_Ki_IC50", **activity_store_kwargs)
ds2 = PCMDataset(name="AR_LIGANDS_Ki_IC50_EC50_Kd", **activity_store_kwargs)

# Datasets that used to be loaded here the same way (store_dir="data") and are
# currently disabled: AR_LIGANDS_CDK, AR_LIGANDS_CDKExtendedFP,
# AR_LIGANDS_CDKEStateFP, AR_LIGANDS_CDKGraphOnlyFP, AR_LIGANDS_CDKMACCSFP,
# AR_LIGANDS_CDKPubchemFP, AR_LIGANDS_CDKSubstructureFP, AR_LIGANDS_CDKAtomPairs2DFP.
# AR_LIGANDS_CDKKlekotaRothFP and AR_LIGANDS_Counts: takes forever and the
# dataset still wasnt prepped.


## Z-scale with 3 PCs and protFP feature (hashed)

###### datasets

In [None]:
# Storage directory of the split to load. Directories used in this notebook:
#   "data/LeaveOutSplit_4"  temporalsplit
#   "data/LeaveOutSplit_1"  leave target out split (left out the A2A and A2B, both in human and in rat)
#   "data/LeaveOutSplit_2"  leave target out split (left out the A1, both in human and in rat)
#   "data/LeaveOutSplit_3"  leave target out split (left out the A3, only in human)
#   "data"                  random split
ZPC3_STORE_DIR = "data/LeaveOutSplit_4"

ds_ZPC3_protFPfeature_MorganFP = PCMDataset(name="AR_LIGANDS_ZPC3_protFPfeature_MorganFP", store_dir=ZPC3_STORE_DIR, proteincol="accession", target_props=tp)
ds_ZPC3_protFPfeature_Rdkit = PCMDataset(name="AR_LIGANDS_ZPC3_protFPfeature_Rdkit", store_dir=ZPC3_STORE_DIR, proteincol="accession", target_props=tp)
ds_ZPC3_protFPfeature_MorganFP_Rdkit = PCMDataset(name="AR_LIGANDS_ZPC3_protFPfeature_MorganFP_Rdkit", store_dir=ZPC3_STORE_DIR, proteincol="accession", target_props=tp)

DS_LIST = [ds_ZPC3_protFPfeature_MorganFP, ds_ZPC3_protFPfeature_Rdkit, ds_ZPC3_protFPfeature_MorganFP_Rdkit]

# Report name and size of each loaded dataset. The loop variable must not be
# `ds`: that name holds the Ki/IC50 dataset used by the activity-comparison
# cell further down, and rebinding it here would make that cell train on the
# last dataset of this list instead.
for loaded_dataset in DS_LIST:
    print(loaded_dataset.name, len(loaded_dataset))

###### Models

In [None]:
# split 4
# Metadata files of the three saved random-forest models (MorganFP, Rdkit,
# MorganFP + Rdkit), loaded in that order.
zpc3_split4_meta_files = (
    './qspr/models/LeaveTargetSplit_4/qspr/models/ZPC3_protFPfeature_MorganFP_RF_TargetSplit4/ZPC3_protFPfeature_MorganFP_RF_TargetSplit4_meta.json',
    './qspr/models/LeaveTargetSplit_4/qspr/models/ZPC3_protFPfeature_Rdkit_RF_TargetSplit4/ZPC3_protFPfeature_Rdkit_RF_TargetSplit4_meta.json',
    './qspr/models/LeaveTargetSplit_4/qspr/models/ZPC3_protFPfeature_MorganFP_Rdkit_RF_TargetSplit4/ZPC3_protFPfeature_MorganFP_Rdkit_RF_TargetSplit4_meta.json',
)
fitted_models = [QSPRModel.fromFile(meta_file) for meta_file in zpc3_split4_meta_files]

# Models of the other splits follow the same layout:
#   leave target out splits 1-3:
#     ./qspr/models/LeaveTargetSplit_<N>/qspr/models/<name>_RF_TargetSplit<N>/<name>_RF_TargetSplit<N>_meta.json
#   random split:
#     ./qspr/models/<name>_RF/<name>_RF_meta.json
# with <name> in ZPC3_protFPfeature_MorganFP, ZPC3_protFPfeature_Rdkit,
# ZPC3_protFPfeature_MorganFP_Rdkit.

## MS-WHIM

###### datasets

In [None]:
# Storage directory of the split to load. Directories used in this notebook:
#   "data/LeaveOutSplit_4"  temporal split
#   "data/LeaveOutSplit_1"  leave target out split (left out the A2A and A2B, both in human and in rat)
#   "data/LeaveOutSplit_2"  leave target out split (see the split definitions in the data-prep section)
#   "data/LeaveOutSplit_3"  leave target out split (left out the A3, only in human)
#   "data"                  random split
MS_WHIM_STORE_DIR = "data/LeaveOutSplit_4"

ds_MS_WHIM_MorganFP = PCMDataset(name="AR_LIGANDS_MS_WHIM_MorganFP", store_dir=MS_WHIM_STORE_DIR, proteincol="accession", target_props=tp)
ds_MS_WHIM_Rdkit = PCMDataset(name="AR_LIGANDS_MS_WHIM_Rdkit", store_dir=MS_WHIM_STORE_DIR, proteincol="accession", target_props=tp)
ds_MS_WHIM_MorganFP_Rdkit = PCMDataset(name="AR_LIGANDS_MS_WHIM_MorganFP_Rdkit", store_dir=MS_WHIM_STORE_DIR, proteincol="accession", target_props=tp)

DS_LIST = [ds_MS_WHIM_MorganFP, ds_MS_WHIM_Rdkit, ds_MS_WHIM_MorganFP_Rdkit]

# Report name and size of each loaded dataset. The loop variable must not be
# `ds`: that name holds the Ki/IC50 dataset used by the activity-comparison
# cell further down, and rebinding it here would make that cell train on the
# last dataset of this list instead.
for loaded_dataset in DS_LIST:
    print(loaded_dataset.name, len(loaded_dataset))

###### Models

In [None]:
# split 4
# Metadata files of the three saved random-forest models (MorganFP, Rdkit,
# MorganFP + Rdkit), loaded in that order.
ms_whim_split4_meta_files = (
    './qspr/models/LeaveTargetSplit_4/qspr/models/MS_WHIM_MorganFP_RF_TargetSplit4/MS_WHIM_MorganFP_RF_TargetSplit4_meta.json',
    './qspr/models/LeaveTargetSplit_4/qspr/models/MS_WHIM_Rdkit_RF_TargetSplit4/MS_WHIM_Rdkit_RF_TargetSplit4_meta.json',
    './qspr/models/LeaveTargetSplit_4/qspr/models/MS_WHIM_MorganFP_Rdkit_RF_TargetSplit4/MS_WHIM_MorganFP_Rdkit_RF_TargetSplit4_meta.json',
)
fitted_models = [QSPRModel.fromFile(meta_file) for meta_file in ms_whim_split4_meta_files]

# Models of the other splits follow the same layout:
#   leave target out splits 1-3:
#     ./qspr/models/LeaveTargetSplit_<N>/qspr/models/<name>_RF_TargetSplit<N>/<name>_RF_TargetSplit<N>_meta.json
#   random split:
#     ./qspr/models/<name>_RF/<name>_RF_meta.json
# with <name> in MS_WHIM_MorganFP, MS_WHIM_Rdkit, MS_WHIM_MorganFP_Rdkit.

# Model construction

## Random Forest

### activity comparison

In [None]:
# Search space handed to OptunaOptimization as param_grid.
# NOTE(review): 'n_jobs' is given as a bare [4] rather than a typed
# ["int", lo, hi] spec — confirm how OptunaOptimization treats such an entry.
params = {
    'n_estimators': ["int", 50, 500],
    'n_jobs': [4],
}

# Train one random-forest PCM model per activity selection (Ki/IC50 first).
fitted_models = []
for dataset in (ds, ds2):
    dataset.nJobs = 4

    # name[11:] drops the leading "AR_LIGANDS_" (11 characters) of the dataset name.
    rf_model = QSPRsklearnPCM(
        base_dir='.',
        data=dataset,
        alg=RandomForestClassifier,
        name=f"{dataset.name[11:]}_RF",
    )

    # Five Optuna trials scored with the model's own score function.
    best_params = OptunaOptimization(
        scoring=rf_model.score_func,
        param_grid=params,
        n_trials=5,
    ).optimize(rf_model)

    rf_model.evaluate()
    rf_model.fit()

    fitted_models.append(rf_model)

fitted_models

In [None]:
# ROC curves for the fitted models; the figure is shown and saved.
# NOTE(review): 'cv' presumably selects the cross-validation results — confirm.
roc_plot = ROCPlot(fitted_models)
roc_plot.make(
    "pchembl_value_Median_class",
    'cv',
    save=True,
    show=True,
)

In [None]:
# Metric plots for the fitted models; make() returns the figures and a summary
# object, and is asked to show and save its output with out_dir "qspr/models/".
metrics_plot = MetricsPlot(fitted_models)
figs, summary = metrics_plot.make(
    "pchembl_value_Median_class",
    show=True,
    save=True,
    out_dir="qspr/models/",
)

In [None]:
# Display the metrics summary returned by MetricsPlot.make, transposed.
summary.T

In [None]:
# Persist every model trained for the activity comparison.
for trained_model in fitted_models:
    trained_model.save()

**No significant difference in metrics can be seen between the two datasets, so all further models use the dataset that includes only "Ki" and "IC50" activities.**

### descriptor comparison

#### Z-scale with 3 PCs and protFP feature (hashed)

In [None]:
# Random-forest hyperparameter search space handed to OptunaOptimization as
# param_grid. An earlier, narrower `params` dict used to be assigned right
# before this one; it was overwritten immediately (dead code) and carried a
# malformed 'n_jobs' entry, so it has been removed.
# NOTE(review): 'n_jobs' is given as a bare [4] rather than a typed
# ["int", lo, hi] spec — confirm how OptunaOptimization treats such an entry.
params = {"n_estimators": ["int", 10, 2000],
        "max_depth": ["int", 1, 100],
        "min_samples_leaf": ["int", 1, 25],
        "max_features": ["int", 1, 100], 
        "min_samples_split": ["int", 2, 12],
         'n_jobs' : [4]}

In [None]:
# The datasets of this section are listed explicitly instead of being taken
# from DS_LIST: DS_LIST is reassigned by the MS-WHIM loading cell earlier in
# the notebook, so on a top-to-bottom run it no longer holds the Z-scale
# datasets when this cell executes.
zpc3_datasets = [ds_ZPC3_protFPfeature_MorganFP, ds_ZPC3_protFPfeature_Rdkit, ds_ZPC3_protFPfeature_MorganFP_Rdkit]

fitted_models = []
for dataset in zpc3_datasets:
    dataset.nJobs = 4

    # name[11:] drops the leading "AR_LIGANDS_" (11 characters) of the dataset name.
    model = QSPRsklearnPCM(base_dir = 'qspr/models/LeaveTargetSplit_4', data=dataset, alg = RandomForestClassifier, name= f"{dataset.name[11:]}_RF_TargetSplit4")

    # Five Optuna trials scored with the model's own score function.
    bayesoptimizer = OptunaOptimization(scoring = model.score_func, param_grid=params, n_trials=5)
    best_params = bayesoptimizer.optimize(model)

    model.evaluate()
    model.fit()
    model.save()

    fitted_models.append(model)

# Display the list of trained models (the original referenced the undefined
# name `fitted_model`, which raised a NameError after all training was done).
fitted_models

In [None]:
# ROC curves for the fitted models; the figure is shown but not saved.
# NOTE(review): 'cv' presumably selects the cross-validation results — confirm.
roc_plot = ROCPlot(fitted_models)
roc_plot.make(
    "pchembl_value_Median_class",
    'cv',
    save=False,
    show=True,
)

In [None]:
# Metric plots for the fitted models; make() returns the figures and a summary
# object. Nothing is shown or saved here (show=False, save=False).
metrics_plot = MetricsPlot(fitted_models)
figs, summary = metrics_plot.make(
    "pchembl_value_Median_class",
    show=False,
    save=False,
    out_dir="qspr/models/",
)

In [None]:
# Display the metrics summary returned by MetricsPlot.make in the previous cell.
summary

#### MS-WHIM

In [None]:
# Random-forest hyperparameter search space handed to OptunaOptimization as
# param_grid. An earlier, narrower `params` dict used to be assigned right
# before this one; it was overwritten immediately (dead code) and carried a
# malformed 'n_jobs' entry, so it has been removed.
# NOTE(review): 'n_jobs' is given as a bare [4] rather than a typed
# ["int", lo, hi] spec — confirm how OptunaOptimization treats such an entry.
params = {"n_estimators": ["int", 10, 2000],
        "max_depth": ["int", 1, 100],
        "min_samples_leaf": ["int", 1, 25],
        "max_features": ["int", 1, 100], 
        "min_samples_split": ["int", 2, 12],
         'n_jobs' : [4]}

In [None]:
# The datasets of this section are listed explicitly instead of being taken
# from DS_LIST, which is reassigned by several cells of this notebook and
# therefore depends on execution order.
ms_whim_datasets = [ds_MS_WHIM_MorganFP, ds_MS_WHIM_Rdkit, ds_MS_WHIM_MorganFP_Rdkit]

fitted_models = []
for dataset in ms_whim_datasets:
    dataset.nJobs = 4

    # name[11:] drops the leading "AR_LIGANDS_" (11 characters) of the dataset name.
    model = QSPRsklearnPCM(base_dir = 'qspr/models/LeaveTargetSplit_4', data=dataset, alg = RandomForestClassifier, name= f"{dataset.name[11:]}_RF_TargetSplit4")

    # Five Optuna trials scored with the model's own score function.
    bayesoptimizer = OptunaOptimization(scoring = model.score_func, param_grid=params, n_trials=5)
    best_params = bayesoptimizer.optimize(model)

    model.evaluate()
    model.fit()
    model.save()

    fitted_models.append(model)

# Display the list of trained models (the original referenced the undefined
# name `fitted_model`, which raised a NameError after all training was done).
fitted_models

In [None]:
# ROC curves for the fitted models; the figure is shown but not saved.
# NOTE(review): 'cv' presumably selects the cross-validation results — confirm.
roc_plot = ROCPlot(fitted_models)
roc_plot.make(
    "pchembl_value_Median_class",
    'cv',
    save=False,
    show=True,
)

In [None]:
# Metric plots for the fitted models; make() returns the figures and a summary
# object. Nothing is shown or saved here (show=False, save=False).
metrics_plot = MetricsPlot(fitted_models)
figs, summary = metrics_plot.make(
    "pchembl_value_Median_class",
    show=False,
    save=False,
    out_dir="qspr/models/",
)

In [None]:
# Display the metrics summary returned by MetricsPlot.make in the previous cell.
summary