In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import coo_array, csr_array

from stanscofi.utils import load_dataset
from stanscofi.datasets import Dataset
from stanscofi.training_testing import cv_training
from stanscofi.training_testing import weakly_correlated_split, random_simple_split, metrics_list
from stanscofi.validation import compute_metrics, plot_metrics
from stanscofi.validation import AUC, Rscore, MRR, RP, PrecisionK, RecallK, F1K, AP, MAP, DCGk, NDCGk, MeanRank, HRk, ERR

from benchscofi import ALSWR, DRRS, LRSSL, PMF, SCPMF, Constant, LogisticMF

In [2]:
# Notebook-wide settings shared by the cells below.
decision_threshold = 0  # passed as `threshold` to model.predict and cv_training
random_state = 1234  # seed passed to model.fit and cv_training

In [None]:
# Dataset
# Load the TRANSCRIPT data from ../data/ and unpack the returned mapping into a
# stanscofi Dataset. `dataset_di` is kept because later cells read
# dataset_di["ratings"] (its shape, index and columns).
# NOTE: the subset names built in a later cell hard-code "TRANSCRIPT"; update
# them too if the alternative below is enabled.
dataset_di = load_dataset("TRANSCRIPT", "../data/")
dataset = Dataset(**dataset_di)

# Alternative input: uncomment to run on PREDICT instead of TRANSCRIPT.
# dataset_di = load_dataset("PREDICT", "../data/")
# dataset = Dataset(**dataset_di)

dataset.summary();

In [92]:
# Hold-out configuration: fraction of the data reserved for testing and the
# metric name handed to the split routine.
test_size = 0.2
metric = "cosine"

# Random split
# The notebook-wide seed is passed explicitly so that changing `random_state`
# in the config cell actually changes (and reproduces) the partition, instead
# of silently relying on the library's default seed.
(train_folds, test_folds), _ = random_simple_split(
    dataset, test_size, metric=metric, random_state=random_state
)

# # Weakly correlated split
# (train_folds, test_folds), _ = weakly_correlated_split(
#     dataset,
#     test_size,
#     early_stop=1,
#     metric=metric,
#     random_state=random_state,
#     verbose=True,
# )


In [93]:
# Turn each fold description into its own named subset of `dataset`
# (train first, then test).
train_dataset, test_dataset = (
    dataset.subset(folds, subset_name=label)
    for folds, label in (
        (train_folds, "Train_TRANSCRIPT"),
        (test_folds, "Test_TRANSCRIPT"),
    )
)

In [None]:
# Print a heading followed by the summary of each subset, train then test.
for heading, part in (
    ("Train dataset", train_dataset),
    ("Test dataset", test_dataset),
):
    print(heading)
    part.summary()

In [95]:
# Constructor parameters for each candidate model, keyed by model class name.
# A value of None is passed to the constructor as-is (see the ALSWR entry).
algo_params = dict(
    PMF=dict(
        reg=0.01,
        learning_rate=0.5,
        n_iters=160,
        n_factors=15,
        batch_size=100,
    ),
    LogisticMF=dict(
        # NOTE(review): the (63, 58) shape is hard-coded — confirm it matches
        # the ratings matrix of the dataset in use.
        counts=np.zeros(shape=(63, 58)),
        num_factors=2,
    ),
    ALSWR=None,
)

In [117]:
# Algorithm
# Exactly one constructor should be active; each one receives its entry from
# `algo_params` (None for ALSWR). The cross-validation cell further down names
# ALSWR explicitly, so keep the two in sync when switching models.
# model = PMF(algo_params["PMF"])
# model = LogisticMF(algo_params["LogisticMF"])
model = ALSWR(algo_params["ALSWR"])

In [None]:
# Change the dtype of the ratings matrix to float64
# NOTE(review): this rebinds `dataset.ratings` on the shared `dataset` object;
# `train_dataset` and `test_dataset` were derived from `dataset` in an earlier
# cell — confirm whether they need the same cast.
dataset.ratings = dataset.ratings.astype(np.float64)
# Train
# NOTE(review): the model is fitted on the full `dataset` but scored on
# `test_dataset` below, so the test ratings may already have been seen during
# fitting. A later cell refits on `train_dataset`; confirm this full-data fit
# is intentional (e.g. only for the exported predictions).
model.fit(dataset, random_state)

# Predictions
# Raw scores first, then labels obtained with the notebook-wide threshold.
scores = model.predict_proba(test_dataset)
predictions = model.predict(scores, threshold=decision_threshold)

model.print_scores(scores)
model.print_classification(predictions)

In [None]:
# Shape of the "ratings" entry returned by load_dataset; the export cell below
# reuses this entry's index and columns to label the predictions.
dataset_di["ratings"].shape

In [None]:
# Distinct values in the full ratings matrix and how often each occurs.
# NOTE: np.unique_counts is only available in NumPy >= 2.0.
np.unique_counts(dataset.ratings.todense())

In [None]:
# Distinct values in the thresholded predictions and how often each occurs.
# NOTE: np.unique_counts is only available in NumPy >= 2.0.
np.unique_counts(predictions.todense())

In [140]:
# Export the thresholded predictions as a labelled CSV table.
# Row and column labels are copied from the raw ratings entry, so `predictions`
# must have exactly the same shape; the DataFrame constructor raises a
# ValueError otherwise instead of producing a mislabelled file.
ratings_labels = dataset_di["ratings"]
predictions_pd = pd.DataFrame(
    predictions.toarray(),
    index=ratings_labels.index,
    columns=ratings_labels.columns,
)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
predictions_path = Path("../results/predictions.csv")
predictions_path.parent.mkdir(parents=True, exist_ok=True)
predictions_pd.to_csv(predictions_path)


In [None]:
# Training
# Fit the current `model` object on the training subset only, using the
# notebook-wide seed.
model.fit(train_dataset, random_state)

In [None]:
# Predictions
# Score the held-out subset with the model fitted in the previous cell, then
# turn the scores into labels with the notebook-wide threshold.
# This rebinds `scores` and `predictions` from the earlier full-data run.
scores = model.predict_proba(test_dataset)
predictions = model.predict(scores, threshold=decision_threshold)

# Print the model's own summaries of the scores and of the predicted labels.
model.print_scores(scores)
model.print_classification(predictions)

In [None]:
# Display the raw scores as a dense matrix for a quick visual check.
scores.todense()

In [None]:
# Display the thresholded predictions as a dense matrix for a quick visual check.
predictions.todense()

In [None]:
# Evaluation settings; `k` and `beta` are reused by the metric cells below.
k, beta = 5, 1
nsplits = 5
njobs = nsplits - 1

# Keyword options for the cross-validation run on the training subset.
cv_options = dict(
    threshold=decision_threshold,
    metric="AUC",
    k=k,
    beta=beta,
    njobs=njobs,
    nsplits=nsplits,
    random_state=random_state,
    show_plots=False,
    verbose=False,
    cv_type="random",  # alternative: cv_type="weakly_correlated"
)

# Cross-validate ALSWR with its configured parameters.
results = cv_training(ALSWR, algo_params["ALSWR"], train_dataset, **cv_options)

# Continue with the model whose "test_metric" entry is the largest.
best_fold = np.argmax(results["test_metric"])
model = results["models"][best_fold]

In [None]:
# Predictions
# Score the held-out subset with the model selected by cross-validation, then
# turn the scores into labels with the notebook-wide threshold.
# This rebinds `scores` and `predictions`; the validation cells below use these.
scores = model.predict_proba(test_dataset)
predictions = model.predict(scores, threshold=decision_threshold)

# Print the model's own summaries of the scores and of the predicted labels.
model.print_scores(scores)
model.print_classification(predictions)

In [None]:
# Distinct values in the test subset's ratings and how often each occurs.
# NOTE: np.unique_counts is only available in NumPy >= 2.0.
y = test_dataset.ratings.toarray()
np.unique_counts(y)

In [None]:
# Distinct values in the thresholded predictions and how often each occurs.
# (The scipy.sparse import that used to sit here is not needed by this cell and
# lives with the other imports at the top of the notebook.)
# NOTE: np.unique_counts is only available in NumPy >= 2.0.
x = predictions.toarray()
np.unique_counts(x)

In [None]:
# Validation
# disease-wise metrics
# Computes every metric named in `metrics_list` for the current scores and
# predictions on the test subset; also returns the arguments for plotting.
metrics, plot_args = compute_metrics(
    scores, predictions, test_dataset, metrics=metrics_list, k=k, beta=beta
)
# Label the plots with the class of the model that is actually being evaluated
# rather than a hard-coded name, so the figure stays correct when the model
# selected above changes.
plot_args.update({"model_name": type(model).__name__, "figsize": (8, 8)})
plot_metrics(**plot_args)


In [None]:
# Display the thresholded predictions (from the cross-validated model) as a
# dense matrix for a quick visual check.
predictions.todense()

In [None]:
# dataset-wide metrics
# Flattened ground truth: the test ratings multiplied element-wise by the test
# folds; every value below 1 is then set to 0.
y_test = (test_dataset.folds.toarray() * test_dataset.ratings.toarray()).ravel()
y_test[y_test < 1] = 0

# Densify and flatten the scores once instead of once per metric.
y_score = scores.toarray().ravel()

# Single source of truth for the metric labels and the functions computing
# them, so the table rows cannot drift out of step with the calls.
whole_metric_fns = {
    "AUC": AUC,
    "Rscore": Rscore,
    "MRR": MRR,
    "RP": RP,
    "PrecisionK": PrecisionK,
    "RecallK": RecallK,
    "F1K": F1K,
    "AP": AP,
    "MAP": MAP,
    "DCGk": DCGk,
    "NDCGk": NDCGk,
    "MeanRank": MeanRank,
    "HRk": HRk,
    "ERR": ERR,
}
whole_metrics = [
    metric_fn(y_test, y_score, k, beta) for metric_fn in whole_metric_fns.values()
]

# One row per metric: the dataset-wide "Value" column placed next to the
# columns of `metrics` from the validation cell.
# NOTE: this rebinds `results`, which the cross-validation cell used for the
# cv_training output.
results = pd.concat(
    (
        pd.DataFrame(
            [whole_metrics],
            index=["Value"],
            columns=list(whole_metric_fns),
        ).T,
        metrics,
    ),
    axis=1,
)

results.head()