In [1]:
import importlib

import numpy as np
import pandas as pd

import stanscofi.validation
from stanscofi.datasets import Dataset
from stanscofi.training_testing import cv_training
from stanscofi.training_testing import weakly_correlated_split, random_simple_split
from stanscofi.utils import load_dataset
from stanscofi.validation import compute_metrics, plot_metrics, metrics_list
# import benchscofi

In [2]:
# Seed handed to model.fit and cv_training further down.
random_state = 1234
# Cut-off passed as `threshold` to cv_training and model.predict further down.
decision_threshold = 0

In [3]:
# Names of the datasets this notebook is configured for.
dataset_names = ["TRANSCRIPT"]
# Options for the train/test split:
#   metric         -> forwarded as `metric` to the split function
#   test_size      -> forwarded as the test-size argument of the split function
#   split_randomly -> selects the random split (True) or the weakly correlated one
split_params = dict(metric="cosine", test_size=0.2, split_randomly=True)

In [4]:
# Hyper-parameters per algorithm, keyed by algorithm name; the inner mapping is
# handed as-is to the model constructor.
algo_params = {
    "PMF": dict(
        reg=0.01,
        learning_rate=0.5,
        n_iters=160,
        n_factors=15,
        batch_size=100,
    ),
}

In [5]:
# Value passed as `nsplits` to cv_training.
nsplits = 5
# Value passed as `njobs` to cv_training; deliberately one less than nsplits.
njobs = nsplits - 1

In [7]:
# Dataset
# Take the dataset name from the configuration cell instead of repeating the literal.
dataset_name = dataset_names[0]
dataset_di = load_dataset(dataset_name, "../data/")
# Fallbacks, applied only when load_dataset did not already provide these keys.
# The previous fallback for "same_item_user_features" was the dict
# {"dataset_name": "TRANSCRIPT"}, which can only have worked through truthiness;
# an explicit boolean is used instead.
# NOTE(review): assumes Dataset treats this argument as a boolean flag — confirm
# against the stanscofi Dataset documentation.
dataset_di.setdefault("same_item_user_features", dataset_name == "TRANSCRIPT")
dataset_di.setdefault("name", dataset_name)
dataset = Dataset(**dataset_di)

In [8]:
# Keep a handle on the dataset's `users` attribute under the name `x`.
# NOTE(review): `x` is not read by any later visible cell — confirm it is still needed.
x = dataset.users

In [9]:
# Echo the split configuration as the cell output.
split_params

{'metric': 'cosine', 'test_size': 0.2, 'split_randomly': True}

In [10]:
# Build the train/test folds with the strategy selected in split_params.
# Both split functions return a pair whose first element is (train, test);
# the second element is discarded.
if split_params["split_randomly"]:
    fold_pair, _ = random_simple_split(
        dataset,
        split_params["test_size"],
        metric=split_params["metric"],
    )
else:
    fold_pair, _ = weakly_correlated_split(
        dataset,
        split_params["test_size"],
        early_stop=1,
        metric=split_params["metric"],
        verbose=True,
    )
train_folds, test_folds = fold_pair

In [11]:
# Show the training folds, converted with .toarray(), as the cell output.
train_folds.toarray()

array([[1., 1., 1., ..., 0., 1., 0.],
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 1.],
       [0., 1., 1., ..., 1., 0., 1.]])

In [12]:
# Restrict the full dataset to the train folds and to the test folds; each subset
# is named after its role followed by the dataset label.
subset_label = "TRANSCRIPT"
train_dataset = dataset.subset(train_folds, subset_name="Train_" + subset_label)
test_dataset = dataset.subset(test_folds, subset_name="Test_" + subset_label)

In [13]:
# Print a summary of each subset. The second call is the last expression of the
# cell, so its return value is additionally echoed as the cell output.
train_dataset.summary()
test_dataset.summary()

----------------------------------------------------------------------
* Rating matrix: 613 drugs x 151 diseases
Including 180 drugs and 101 diseases involved in at least one positive/negative rating
321 positive, 9 negative, 92233 unlabeled (including 18513 unavailable) drug-disease ratings
Sparsity: 0.36 percent (on drugs/diseases with at least one known rating 1.82)
-----------------------------------
* Feature matrices:
#Drug features: 12096	Total #Drugs: 613
Missing features: 0.00 percent
#Disease features: 12096	Total #Disease: 151
Missing features: 0.00 percent
----------------------------------------------------------------------

----------------------------------------------------------------------
* Rating matrix: 613 drugs x 151 diseases
Including 63 drugs and 58 diseases involved in at least one positive/negative rating
80 positive, 2 negative, 92481 unlabeled (including 74050 unavailable) drug-disease ratings
Sparsity: 0.09 percent (on drugs/diseases with at least one kno

(613,
 151,
 63,
 58,
 np.int64(80),
 np.int64(2),
 np.int64(92481),
 np.int64(74050),
 np.float64(0.08858831282477879),
 np.float64(2.2441160372194857),
 12096,
 np.float64(0.0),
 12096,
 np.float64(0.0))

In [None]:
# Algorithm
# `algo` was not assigned in any visible cell; "PMF" is the only entry of algo_params.
algo = "PMF"
# import_module returns the submodule itself, so the class is fetched with getattr.
# The previous eval("benchscofi." + ...) required the name `benchscofi` to be bound in
# the notebook namespace, which a bare __import__(...) call does not do (and the
# top-level `import benchscofi` is commented out).
model_class = getattr(importlib.import_module("benchscofi." + algo), algo)
model = model_class(algo_params[algo])


In [None]:
# Training
# Fit the model on the training subset. `random_state` is passed positionally as the
# second argument — presumably the seed; confirm against the model's fit signature.
model.fit(train_dataset, random_state)

In [None]:





######################
## Cross-validation ##
######################
# The names below were never assigned in any visible cell, so this cell failed with
# NameError on a fresh kernel. They are defined here so the cell runs on its own.
algo = "PMF"  # the only entry of algo_params
params = algo_params[algo]
# NOTE(review): the values of k and beta are assumptions — TODO confirm the intended
# settings for the ranking cut-off (k) and the F-score weight (beta).
k = 5
beta = 1

# Look the model class up by name without eval: import_module returns the submodule
# benchscofi.<algo>, and the class of the same name is read from it.
model_class = getattr(importlib.import_module("benchscofi." + algo), algo)

cv_results = cv_training(
    model_class,
    params,
    train_dataset,
    threshold=decision_threshold,
    metric="AUC",
    k=k,
    beta=beta,
    njobs=njobs,
    nsplits=nsplits,
    random_state=random_state,
    show_plots=False,
    verbose=True,
    cv_type="random" if split_params["split_randomly"] else "weakly_correlated",
)
# Keep the model at the position of the highest "test_metric" value.
model = cv_results["models"][np.argmax(cv_results["test_metric"])]

#################
## Predictions ##
#################
scores = model.predict_proba(test_dataset)
predictions = model.predict(scores, threshold=decision_threshold)

model.print_scores(scores)
model.print_classification(predictions)

#################
## Validation  ##
#################

## disease-wise metrics
metrics, plot_args = compute_metrics(
    scores, predictions, test_dataset, metrics=metrics_list, k=k, beta=beta, verbose=1
)  ## run all metrics
plot_args.update({"model_name": algo, "figsize": (8, 8)})
plot_metrics(**plot_args)

## dataset-wide metrics
# Element-wise product of the test folds and the ratings, flattened; every entry
# below 1 is then set to 0.
y_test = (test_dataset.folds.toarray() * test_dataset.ratings.toarray()).ravel()
y_test[y_test < 1] = 0
y_scores = scores.toarray().ravel()

# "Fscore" and "TAU" are left out of the dataset-wide computation.
whole_metric_names = [m for m in metrics_list if m not in ["Fscore", "TAU"]]
# Each metric is a function of stanscofi.validation, looked up by name and called as
# f(y_true, y_scores, k, beta).
whole_metrics = [
    getattr(stanscofi.validation, metric)(y_test, y_scores, k, beta)
    for metric in whole_metric_names
]

# Dataset-wide values (column "Value") next to the output of compute_metrics.
results = pd.concat(
    (
        pd.DataFrame(
            [whole_metrics],
            index=["Value"],
            columns=whole_metric_names,
        ).T,
        metrics,
    ),
    axis=1,
)
