# Run our trained scoring function on example molecules

This Jupyter Notebook helps users run our trained scoring function ANN_PLEC on their own molecules. In this notebook, we use test-set molecules comprising 8 actives and 330 inactives (DeepCoy decoys).

Additional information can be found in our Nature Protocols paper: Tran-Nguyen, V. K., Junaid, M., Simeon, S. & Ballester, P. J. A practical guide to machine-learning scoring for structure-based virtual screening. Nat. Protoc. (2023)

We recommend that users set up the protocol-env environment before running the code in this Jupyter notebook. This can be done using the protocol-env.yml file in our MLSF-protocol GitHub repository: https://github.com/vktrannguyen/MLSF-protocol.

To generate DeepCoy decoys, please use the GitHub repository https://github.com/fimrie/DeepCoy/tree/master

## 1. Import all necessary Python packages

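The following libraries/packages/toolkits need to be installed beforehand: jupyter notebook, numpy, scipy, pandas, oddt, scikit-learn (imported as sklearn), xgboost, rdkit, deepchem, joblib, tqdm, hyperopt, tensorflow. (os, glob, pickle and tempfile are part of the Python standard library and need no separate installation.)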

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
from oddt.fingerprints import PLEC
from scipy import stats
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, accuracy_score, auc
from sklearn.model_selection import cross_val_predict
from sklearn.neural_network import MLPClassifier
from sklearn.utils import parallel_backend
from xgboost.sklearn import XGBClassifier
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Parallel, delayed
from tqdm import tqdm
import pickle
import glob
import tempfile
import hyperopt
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK, space_eval

## 2. Load functions

In [None]:
def evaluate_performance(y, pred_y, pred_y_prob):
    """Summarise binary-classification performance as a two-column table.

    Parameters
    ----------
    y : array-like
        True class labels.
    pred_y : array-like
        Predicted class labels; used for accuracy, precision, recall,
        F1 score, MCC and Cohen's kappa.
    pred_y_prob : array-like
        Predicted scores, used for ROC AUC, average precision and log loss.
        Presumably the probability of the positive class -- confirm against
        the caller.

    Returns
    -------
    pandas.DataFrame
        One row per metric, with columns "Metric" (name) and "Value".
    """
    # Import every metric locally: the notebook's import cell only brings in
    # accuracy_score and matthews_corrcoef, so the other names would
    # otherwise raise NameError when this function is called.
    from sklearn.metrics import (
        accuracy_score,
        average_precision_score,
        cohen_kappa_score,
        f1_score,
        log_loss,
        matthews_corrcoef,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Insertion order of this dict fixes the row order of the output table.
    metrics = {
        "Accuracy": accuracy_score(y, pred_y),
        "Precision": precision_score(y, pred_y),
        "Recall": recall_score(y, pred_y),
        "F1 Score": f1_score(y, pred_y),
        "ROC AUC": roc_auc_score(y, pred_y_prob),
        "Avg Precision": average_precision_score(y, pred_y_prob),
        "Log Loss": log_loss(y, pred_y_prob),
        "MCC": matthews_corrcoef(y, pred_y),
        "Kappa": cohen_kappa_score(y, pred_y),
    }

    evaluation_df = pd.DataFrame({
        "Metric": list(metrics.keys()),
        "Value": list(metrics.values()),
    })

    return evaluation_df

# Example usage:
# y_true = actual_labels
# pred_y = predicted_labels
# pred_y_prob = predicted_probabilities
# evaluation_df = evaluate_performance(y_true, pred_y, pred_y_prob)
# print(evaluation_df)


In [None]:
from math import ceil
def enrichment_factor(y_true, y_score, percentage=1, pos_label=None, kind='fold'):
    """Computes enrichment factor for given percentage, i.e. EF_1% is
    enrichment factor for first percent of given samples. This function assumes
    that results are already sorted and samples with best predictions are first.

    Parameters
    ----------
    y_true : array-like, shape=[n_samples]
        True binary labels, in range {0,1} or {-1,1}. If positive label is
        different than 1, it must be explicitly defined. Any sequence that
        numpy can convert to an array is accepted (list, tuple, ndarray,
        pandas Series).

    y_score : array-like, shape=[n_samples]
        Scores for tested series of samples. Not used in the computation
        (the samples must already be ordered by the caller); kept so the
        signature stays compatible with existing calls.

    percentage : int or float
        The percentage for which EF is being calculated

    pos_label: int
        Positive label of samples (if other than 1)

    kind: 'fold' or 'percentage' (default='fold')
        Two kinds of enrichment factor: fold and percentage.
        Fold shows the increase over random distribution (1 is random, the
        higher EF the better enrichment). Percentage returns the fraction of
        positive labels within the top x% of dataset.

    Returns
    -------
    ef : float
        Enrichment factor for the given percentage. For kind='percentage'
        this is a fraction in the range 0 to 1; for kind='fold' it is that
        fraction divided by the overall fraction of positives, so it can
        exceed 1.

    Raises
    ------
    AssertionError
        If y_true is empty or contains no sample equal to pos_label.
    """
    if pos_label is None:
        pos_label = 1
    # Convert to an ndarray first: comparing a plain list with a scalar gives
    # a single bool rather than an element-wise mask, and slicing an ndarray
    # is always positional.
    labels = np.asarray(y_true) == pos_label
    # Check the size first so that empty input is reported as such instead of
    # being blamed on pos_label.
    assert len(labels) > 0, "Sample size must be greater than 0"
    assert labels.sum() > 0, "There are no correct predicions. Double-check the pos_label"
    # number of samples in the top `percentage` percent (rounded up)
    n_perc = int(ceil(percentage / 100. * len(labels)))
    # fraction of positive labels among those top samples
    out = labels[:n_perc].sum() / n_perc
    if kind == 'fold':
        # normalise by the fraction of positives in the whole set
        out /= (labels.sum() / len(labels))
    return out

In [None]:
def enrichment_factor_your_choice(data):
    """Return the rounded top-1% enrichment factor of the 'test_score_C1' scores.

    The scores are taken from ``data['test_score_C1']`` and the activity
    labels from ``test_plec_Cr['PLEC_4093']``, a notebook-level variable
    that this function does not receive as an argument.

    NOTE(review): ``test_plec_Cr`` is not defined in the visible part of the
    notebook, and a later cell defines another function with this same name,
    which replaces this one once that cell is run -- confirm this version is
    still needed.
    """
    ranked = pd.DataFrame(data['test_score_C1'])
    # Labels are attached by index alignment with the score rows.
    ranked['activity'] = test_plec_Cr['PLEC_4093']
    # Best-scored samples first, as enrichment_factor expects.
    ranked = ranked.sort_values(by='test_score_C1', ascending=False)
    top_fraction_ef = enrichment_factor(
        ranked['activity'],
        ranked['test_score_C1'],
        percentage=1,
    )
    return round(top_fraction_ef)

In [None]:
def enrichment_factor_your_choice(data, n_score_columns=11, percentage=1):
    """Compute a rounded enrichment factor for each leading score column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose first ``n_score_columns`` columns (by position) hold
        scores and which has an 'Observed_Activity' column with the labels
        passed to ``enrichment_factor`` as ``y_true``.
    n_score_columns : int, default=11
        Number of leading columns to evaluate. The default keeps the
        previously hard-coded value.
    percentage : int or float, default=1
        Top percentage of the ranked samples used for the enrichment factor.

    Returns
    -------
    list
        One rounded enrichment factor per evaluated column, in column order.

    Raises
    ------
    IndexError
        If ``data`` has fewer than ``n_score_columns`` columns.
    """
    ef_values = []
    for position in range(n_score_columns):
        # One-column frame with the scores, plus the labels aligned by index.
        ranked = pd.DataFrame(data.iloc[:, position])
        ranked['activity'] = data['Observed_Activity']
        # enrichment_factor expects the best-scored samples first.
        ranked = ranked.sort_values(data.columns[position], ascending=False)
        enrichment_value = round(
            enrichment_factor(ranked['activity'], ranked.iloc[:, 0], percentage=percentage)
        )
        ef_values.append(enrichment_value)
    return ef_values

In [None]:
def map_array_to_status(array):
    """Translate a sequence of 0/1 class labels into status strings.

    Each 0 becomes "inactive" and each 1 becomes "active". Any other value
    raises KeyError. The result is a new list in the same order as the input.
    """
    label_for_class = {0: "inactive", 1: "active"}

    # Look every element up in the table, preserving order.
    return list(map(label_for_class.__getitem__, array))

## 3. Load data

In [None]:
# Read the test-set table from a CSV file. The file name below looks like a
# placeholder -- replace it with the actual path to the TS1 CSV file.
# index_col=None means no CSV column is used as the row index.
example_test_set = pd.read_csv('path_to_TS1.csv_file',index_col=None)

In [None]:
# Show how many rows carry each distinct value of the 'class' column.
example_test_set['class'].value_counts()

In [None]:
# Model inputs: every column except the two class-label columns.
X_example_test_set = example_test_set.drop(columns=['int_class', 'class'])
# Target: the 'int_class' label column.
y_example_test_set = example_test_set['int_class']

## 4. Load Scoring function

In [None]:
# Path to the pickled ANN_PLEC model. This is an absolute path on the
# original author's machine -- change it to where the .sav file is stored.
filename = '/home/juni/working/ythdf/models/ANN_PLEC_MCTD8.sav'
# SECURITY: unpickling can execute arbitrary code, so only load model files
# obtained from a trusted source.
# The with-block guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    ANN_PLEC = pickle.load(model_file)

## 5. Predict class

In [None]:
# Predict a class for every row of the feature table with the loaded model.
# The next step maps these predictions with a 0/1 lookup, so they are
# expected to be 0 or 1.
activity_prediction=ANN_PLEC.predict(X_example_test_set)

## 6. Map 0/1 class predictions to inactive/active labels

In [None]:
# Convert the predicted 0/1 classes to "inactive"/"active" strings.
status_list = map_array_to_status(activity_prediction)

In [None]:
# Store the readable predictions in a new 'predicted_class' column.
example_test_set['predicted_class'] = status_list

In [None]:
# Display the table, which now includes the 'predicted_class' column.
example_test_set