## Dependencies
- [PScore](https://doi.org/10.7554/eLife.31486)
- [PhaSePred](http://predict.phasep.pro): download the human reviewed dataset from [here](http://predict.phasep.pro/static/phasepred/database/human_reviewed.zip). The notebook reads `human_reviewed.json` from the working directory.

In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, make_scorer
from statistics import stdev, variance, mean
import torch
from torch import nn, optim
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from scipy.interpolate import interp1d
from sklearn.decomposition import PCA

In [None]:
# Record keys of each protein class, taken from the clustered FASTA files.
client_set = set(SeqIO.index("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta").keys())
scaffold_set = set(SeqIO.index("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta").keys())
nonllps_set = set(SeqIO.index("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta").keys())

# The .npy file holds a pickled dict wrapped in a 0-d object array; .item()
# unwraps it.  Keys are matched against the FASTA record keys below.
# NOTE(review): allow_pickle=True executes pickled code on load -- only use
# with embedding files you produced yourself.
mat = np.load("embedding/PTT5XLU50_human.npy", allow_pickle=True).item()

list_client, list_nonllps, list_scaffold = [], [], []
client_id, scaffold_id, nonllps_id = [], [], []

# (membership set, value list, id list), tested in this order so that a key
# present in several sets is assigned to the first match only.
class_buckets = (
    (nonllps_set, list_nonllps, nonllps_id),
    (client_set, list_client, client_id),
    (scaffold_set, list_scaffold, scaffold_id),
)

for record_key, embedding in mat.items():
    for members, vectors, ids in class_buckets:
        if record_key in members:
            vectors.append(embedding)
            # Keep only the second "|"-separated field of the record key.
            ids.append(record_key.split("|")[1])
            break

In [None]:
class EarlyStopping:
    """Track a validation loss and signal when it has stopped improving.

    Call the instance once per epoch with the current validation loss and the
    model.  Every strict improvement saves ``model.state_dict()`` to ``path``
    and resets the counter; ``patience`` consecutive calls without a strict
    improvement set ``early_stop`` to ``True``.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is raised.
        verbose: If true, print a message on every counter increment and
            every checkpoint.
        path: File the best ``state_dict`` is written to with ``torch.save``.
    """

    def __init__(self, patience=5, verbose=False, path='checkpoint_model.pth'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf (lower case) is the supported spelling; the np.Inf alias was
        # removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.path = path

    def __call__(self, val_loss, model):
        # Higher score == lower loss, so "improvement" is a strictly larger score.
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we see is the best so far.
            self.best_score = score
            self.checkpoint(val_loss, model)
        elif score <= self.best_score:
            # Ties count as "no improvement".
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.checkpoint(val_loss, model)
            self.counter = 0

    def checkpoint(self, val_loss, model):
        """Write the model's ``state_dict`` to ``self.path`` and record ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss
        
def training_loop(n_epochs, optimizer, model, loss, mask_train, x_train,  y_train):
    """Train ``model`` full-batch with early stopping on a random 20% hold-out.

    The samples are shuffled once; the last 20% (at least one sample) are held
    out for validation and the rest are used for training.  Each epoch runs
    one forward/backward pass over the whole training split.  Training stops
    after ``n_epochs`` epochs or once the validation loss has not improved for
    10 consecutive epochs, and the weights with the best validation loss are
    loaded back into ``model`` before returning.

    Args:
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model: Module to train; modified in place.
        loss: Callable ``loss(prediction, target)`` returning a scalar tensor.
        mask_train: Unused; kept so existing callers keep working.
        x_train: Input tensor, indexed along its first dimension.
        y_train: Target tensor, indexed along its first dimension.

    Raises:
        ValueError: If fewer than two samples are given, so that no
            train/validation split is possible.
    """
    import os
    import tempfile

    n_samples = x_train.shape[0]
    if n_samples < 2:
        raise ValueError("training_loop needs at least 2 samples to hold out a validation set")

    # Always keep at least one validation sample.  Slicing by an explicit
    # training size also avoids the [:-0] / [-0:] pitfall, which would yield
    # an empty training split and put everything in validation.
    n_val = max(1, int(n_samples * 0.2))
    n_train = n_samples - n_val

    shuffled_ind = torch.randperm(n_samples)
    train_ind = shuffled_ind[:n_train]
    val_ind = shuffled_ind[n_train:]

    x_val = x_train[val_ind]
    y_val = y_train[val_ind]
    x_train = x_train[train_ind]
    y_train = y_train[train_ind]

    patience = 10
    # Checkpoint into a private temporary directory: a fixed file name in the
    # working directory would be shared by fits running in parallel processes.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        earlystopping = EarlyStopping(
            patience=patience,
            verbose=False,
            path=os.path.join(checkpoint_dir, "checkpoint_model.pth"),
        )
        for epoch in range(1, n_epochs + 1):
            model.train()
            y_train_pred = model.forward(x_train)
            loss_train = loss(y_train_pred, y_train)

            # Validation loss is measured on the weights *before* this epoch's
            # update, so a checkpoint always matches the loss it was saved for.
            model.eval()
            with torch.no_grad():
                y_val_pred = model.forward(x_val)
                loss_val = loss(y_val_pred, y_val)

            earlystopping(loss_val.item(), model)
            if earlystopping.early_stop:
                break

            optimizer.zero_grad()
            loss_train.backward()
            optimizer.step()

        # Restore the best weights; without this the model would keep the
        # parameters reached after the non-improving epochs.
        if earlystopping.best_score is not None:
            model.load_state_dict(torch.load(earlystopping.path))
        
class FNN2(nn.Module):
    """Feed-forward network with one 32-unit hidden layer and two outputs.

    Layer order is Linear -> Dropout -> ReLU -> Linear.  ``forward`` returns
    the raw output of the last linear layer (no softmax is applied here).

    Args:
        embeddings_dim: Size of the last dimension of the input.
        dropout: Dropout probability applied after the first linear layer.
    """

    def __init__(self, embeddings_dim=1024, dropout=0.25):
        super().__init__()

        hidden_units = 32
        n_outputs = 2
        layers = [
            nn.Linear(embeddings_dim, hidden_units),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_units, n_outputs),
        ]
        # Attribute name and layer positions are kept so state_dict keys
        # ("linear.0.*", "linear.3.*") stay the same.
        self.linear = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
        """Apply the layer stack to ``x``; extra keyword arguments are ignored."""
        return self.linear(x)
    

class NN(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around ``FNN2`` trained with ``training_loop``.

    The wrapped network has two outputs, so at most two classes are supported.
    Labels may be of any type ``np.unique`` can sort; they are encoded to
    0/1 for training and decoded again in ``predict``.

    Args:
        n_epochs: Maximum number of training epochs.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, n_epochs=500, lr=0.03):
        self.n_epochs = n_epochs
        self.lr = lr
        self.model = None
        self.optim = None
        self.loss = nn.CrossEntropyLoss()

    def fit(self, X, y):
        """Build a fresh network sized to ``X`` and train it on ``(X, y)``.

        Raises:
            ValueError: If ``y`` contains more than two distinct labels.
        """
        # classes_ is sorted; y_encoded holds each label's position in it, so
        # column j of predict_proba corresponds to classes_[j].
        self.classes_, y_encoded = np.unique(np.asarray(y), return_inverse=True)
        if len(self.classes_) > 2:
            raise ValueError(
                "NN supports at most two classes, got {}".format(len(self.classes_))
            )
        X_tensor = torch.tensor(X, dtype=torch.float)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long)
        n_dim = X_tensor.shape[1]
        self.model = FNN2(embeddings_dim=n_dim)
        self.optim = optim.Adam(self.model.parameters(), lr=self.lr)
        training_loop(
            n_epochs=self.n_epochs,
            optimizer=self.optim,
            model=self.model,
            loss=self.loss,
            mask_train=None,
            x_train=X_tensor,
            y_train=y_tensor,
        )
        return self

    def _logits(self, X):
        """Run the trained network on ``X`` in eval mode without gradients."""
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.float)
            self.model.eval()
            return self.model(X_tensor)

    def predict(self, X):
        """Return the predicted label (an element of ``classes_``) per row of ``X``."""
        _, predicted = torch.max(self._logits(X), 1)
        # Map argmax positions back to the original labels.
        return self.classes_[predicted.numpy()]

    def predict_proba(self, X):
        """Return softmax probabilities, one column per network output."""
        probas = nn.Softmax(dim=1)(self._logits(X))
        return probas.numpy()

In [None]:
class Display_auc():
    """Cross-validate a classifier and score three external predictors on the same folds.

    ``run`` performs a shuffled, stratified 10-fold split (``random_state=0``).
    In each fold the model is refitted and evaluated on the held-out part; the
    external scores (``pdps``, ``saps``, ``pscore``) are evaluated on those
    held-out samples whose id is present in the respective dict.

    Args:
        x: Feature matrix, indexed by row.
        y: Labels aligned with ``x``.
        model: Estimator providing ``fit``, ``predict`` and ``predict_proba``.
        saps: Mapping from sample id to score.
        pdps: Mapping from sample id to score.
        pscore: Mapping whose keys are "|"-separated; only the second field
            of each key is kept as the sample id.
        idx: Sample ids aligned with ``x``.
    """

    def __init__(self,x:np.array, y:np.array, model, saps:dict, pdps:dict, pscore:dict, idx:np.array):
        self.x = x
        self.y = y
        self.model = model
        self.saps = saps
        self.pdps = pdps
        # Re-key by the second "|" field so lookups can use the ids in idx.
        self.pscore = {key.split("|")[1]: value for key, value in pscore.items()}
        self.idx = idx

    @staticmethod
    def _curve_summary(y_true, y_score, fpr_grid, recall_grid):
        """Compute ROC/PR curves for one fold and resample them onto fixed grids."""
        fpr, tpr, _ = roc_curve(y_true, y_score)
        precision, recall, _ = precision_recall_curve(y_true, y_score)

        tpr_on_grid = np.interp(fpr_grid, fpr, tpr)
        tpr_on_grid[0] = 0.0  # pin the curve to the origin

        # interp1d is used rather than np.interp because recall is not
        # returned in increasing order.
        precision_on_grid = interp1d(recall, precision)(recall_grid)

        return {
            "tpr": tpr_on_grid,
            "precision": precision_on_grid,
            "raw_precision": precision,
            "raw_recall": recall,
            "rocauc": auc(fpr, tpr),
            "prauc": auc(recall, precision),
        }

    def _covered_subset(self, test, lookup):
        """Return (labels, scores) for the test samples whose id is in ``lookup``."""
        covered = [i for i in test if self.idx[i] in lookup]
        labels = [self.y[i] for i in covered]
        scores = [lookup[self.idx[i]] for i in covered]
        return labels, scores

    def run(self):
        """Run the cross-validation and store all per-fold and mean results on ``self``."""
        splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

        mean_fpr = np.linspace(0, 1, 1000)
        mean_recall = np.linspace(0, 1, 1000)

        tprs, precisions = [], []
        raw_precisions, raw_recalls = [], []
        scores_rocauc, scores_prauc, scores_mcc = [], [], []

        # External predictors, evaluated in this order in every fold.
        lookups = {"pdps": self.pdps, "saps": self.saps, "ps": self.pscore}
        external = {
            name: {"tprs": [], "precisions": [], "rocauc": [], "prauc": []}
            for name in lookups
        }

        for train, test in splitter.split(self.x, self.y):
            self.model.fit(self.x[train], self.y[train])
            # Column 1 of predict_proba is used as the ranking score.
            y_pred_proba = self.model.predict_proba(self.x[test])[:, 1]
            y_pred = self.model.predict(self.x[test])
            y_test = self.y[test]

            fold = self._curve_summary(y_test, y_pred_proba, mean_fpr, mean_recall)
            tprs.append(fold["tpr"])
            precisions.append(fold["precision"])
            raw_precisions.append(fold["raw_precision"])
            raw_recalls.append(fold["raw_recall"])
            scores_rocauc.append(fold["rocauc"])
            scores_prauc.append(fold["prauc"])
            scores_mcc.append(matthews_corrcoef(y_test, y_pred))

            for name, lookup in lookups.items():
                labels, scores = self._covered_subset(test, lookup)
                fold = self._curve_summary(labels, scores, mean_fpr, mean_recall)
                store = external[name]
                store["tprs"].append(fold["tpr"])
                store["precisions"].append(fold["precision"])
                store["rocauc"].append(fold["rocauc"])
                store["prauc"].append(fold["prauc"])

        # Model results.  Only the model's mean ROC curve has its end point
        # forced to 1.0.
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        self.mean_tpr = mean_tpr
        self.tprs = tprs
        self.precisions = raw_precisions
        self.recall = raw_recalls
        self.mean_precision = np.mean(precisions, axis=0)
        self.scores_rocauc = scores_rocauc
        self.scores_prauc = scores_prauc
        self.scores_mcc = scores_mcc

        # External predictor results.
        self.mean_tpr_pdps = np.mean(external["pdps"]["tprs"], axis=0)
        self.mean_tpr_saps = np.mean(external["saps"]["tprs"], axis=0)
        self.mean_tpr_ps = np.mean(external["ps"]["tprs"], axis=0)

        self.mean_precision_pdps = np.mean(external["pdps"]["precisions"], axis=0)
        self.mean_precision_saps = np.mean(external["saps"]["precisions"], axis=0)
        self.mean_precision_ps = np.mean(external["ps"]["precisions"], axis=0)

        self.rocauc_pdps = external["pdps"]["rocauc"]
        self.rocauc_saps = external["saps"]["rocauc"]
        self.rocauc_ps = external["ps"]["rocauc"]

        self.prauc_pdps = external["pdps"]["prauc"]
        self.prauc_saps = external["saps"]["prauc"]
        self.prauc_ps = external["ps"]["prauc"]

In [None]:
def _read_pscore_table(path):
    """Parse a whitespace-separated score file into ``{record key: score}``.

    For every non-empty line the second field is the score and the third
    field, minus its first character, is the key.
    """
    table = {}
    with open(path, "r") as handle:
        for row in handle.read().split("\n"):
            if row == "":
                continue
            fields = row.split()
            table[fields[2][1:]] = float(fields[1])
    return table


# PhaSePred scores: keep the "PhaSePred" row, then pull the two 10-feature
# scores for every entry in it.
df = pd.read_json("human_reviewed.json")  # PhaSePred
df = df.loc["PhaSePred", :]
pdps10 = {protein: df[protein]['PdPS-10fea'] for protein in df.index}
saps10 = {protein: df[protein]['SaPS-10fea'] for protein in df.index}

# Scores for the three protein classes, merged into one lookup
# (later dicts win on duplicate keys).
cli_dict = _read_pscore_table("../fig1/result/human_client_pscore.txt")
oth_dict = _read_pscore_table("../fig1/result/human_nonllps_pscore.txt")
sca_dict = _read_pscore_table("../fig1/result/human_scaffold_pscore.txt")
pscore = cli_dict | oth_dict | sca_dict

## Scaffold vs. Non-LLPS

In [None]:
# Scaffold entries are the True class, non-LLPS entries the False class.
x = np.array(list_scaffold + list_nonllps)
idx = np.array(scaffold_id + nonllps_id)
y = np.array([True] * len(list_scaffold) + [False] * len(list_nonllps))

# Base learners: each one standardises the features and reduces them with PCA
# before its classifier.
nn_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=128),
    NN(lr=0.05),
)
rf_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=128),
    RandomForestClassifier(
        max_depth=5,
        max_features="log2",
        class_weight="balanced",
        n_estimators=200,
        n_jobs=40,
    ),
)
svm_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=128),
    SVC(class_weight="balanced", probability=True, C=1, kernel="rbf", gamma="scale"),
)
hgboost_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=64),
    HistGradientBoostingClassifier(
        learning_rate=0.1,
        max_leaf_nodes=31,
        min_samples_leaf=40,
        class_weight="balanced",
    ),
)
estimators = [
    ('nn', nn_branch),
    ('rf', rf_branch),
    ('svm', svm_branch),
    ('hgboost', hgboost_branch),
]

# Stack the base learners under a logistic-regression meta-learner, using a
# shuffled stratified 5-fold split inside the stacking classifier.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
    cv=cv,
)

In [None]:
# Cross-validate the stacked model and score the external predictors.
display = Display_auc(
    x=x,
    y=y,
    model=model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
display.run()

In [None]:
# Mean over the cross-validation folds of each metric of the stacked model.
fold_means = [
    mean(fold_scores)
    for fold_scores in (display.scores_rocauc, display.scores_prauc, display.scores_mcc)
]
print("ROC-AUC:{}, PR-AUC:{}, MCC:{}".format(*fold_means))

In [None]:
# Area under the fold-averaged ROC curve of each external predictor.
mean_fpr = np.linspace(0, 1, 1000)
mean_tpr_pdps = display.mean_tpr_pdps
mean_tpr_saps = display.mean_tpr_saps
mean_tpr_ps = display.mean_tpr_ps
for report_template, averaged_tpr in (
    ("PdPS ROC-AUC:{}", mean_tpr_pdps),
    ("SaPS ROC-AUC:{}", mean_tpr_saps),
    ("PS ROC-AUC:{}", mean_tpr_ps),
):
    print(report_template.format(auc(mean_fpr, averaged_tpr)))

In [None]:
# Area under the fold-averaged precision-recall curve of each external predictor.
mean_recall = np.linspace(0, 1, 1000)
mean_precision_pdps = display.mean_precision_pdps
mean_precision_saps = display.mean_precision_saps
mean_precision_ps = display.mean_precision_ps
for report_template, averaged_precision in (
    ("PdPS PR-AUC:{}", mean_precision_pdps),
    ("SaPS PR-AUC:{}", mean_precision_saps),
    ("PS PR-AUC:{}", mean_precision_ps),
):
    print(report_template.format(auc(mean_recall, averaged_precision)))

In [None]:
from scipy.stats import wilcoxon

In [None]:
# Wilcoxon test on the per-fold PR-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_prauc in (
    ("PdPS", display.prauc_pdps),
    ("SaPS", display.prauc_saps),
    ("PScore", display.prauc_ps),
):
    w, p = wilcoxon(display.scores_prauc, rival_prauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# Wilcoxon test on the per-fold ROC-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_rocauc in (
    ("PdPS", display.rocauc_pdps),
    ("SaPS", display.rocauc_saps),
    ("PScore", display.rocauc_ps),
):
    w, p = wilcoxon(display.scores_rocauc, rival_rocauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# ROC figure: per-fold curves of the stacked model in gray, then the
# fold-averaged curve of every method.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

for fold_tpr in display.tprs:
    ax.plot(fpr, fold_tpr, color="gray", lw=1, alpha=0.3)

# (legend template, colour, mean curve, per-fold AUCs), drawn in this order.
roc_series = (
    (r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)", "b", display.mean_tpr, display.scores_rocauc),
    (r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_tpr_pdps, display.rocauc_pdps),
    (r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_tpr_saps, display.rocauc_saps),
    (r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_tpr_ps, display.rocauc_ps),
)
for legend_template, colour, mean_tpr, fold_aucs in roc_series:
    mean_auc = mean(fold_aucs)
    std_auc = stdev(fold_aucs)
    ax.plot(
        fpr,
        mean_tpr,
        color=colour,
        label=legend_template % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

# Chance diagonal.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])

plt.title("Scaffold vs. Non-LLPS", fontsize=20)
ax.legend(loc="lower right", fontsize=10)
plt.xlabel("False positive rate", fontsize=13)
plt.ylabel("True positive rate", fontsize=13)
plt.savefig("result/fig2c_roc.pdf")
plt.show()

In [None]:
# Precision-recall figure: raw per-fold curves of the stacked model in gray,
# then the fold-averaged curve of every method.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
mean_recall = np.linspace(0, 1, 1000)

for fold_recall, fold_precision in zip(display.recall, display.precisions):
    ax.plot(fold_recall, fold_precision, color="gray", lw=1, alpha=0.3)

# (legend template, colour, mean curve, per-fold AUCs), drawn in this order.
pr_series = (
    (r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)", "b", display.mean_precision, display.scores_prauc),
    (r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_precision_pdps, display.prauc_pdps),
    (r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_precision_saps, display.prauc_saps),
    (r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_precision_ps, display.prauc_ps),
)
for legend_template, colour, mean_precision, fold_aucs in pr_series:
    mean_auc = mean(fold_aucs)
    std_auc = stdev(fold_aucs)
    ax.plot(
        mean_recall,
        mean_precision,
        color=colour,
        label=legend_template % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

plt.xlabel("Recall", fontsize=13)
plt.ylabel("Precision", fontsize=13)
plt.title("Scaffold vs. Non-LLPS", fontsize=20)
ax.legend(loc="upper right", fontsize=10)
plt.savefig("result/fig2d_pr.pdf")
plt.show()

## Scaffold vs. Client

In [None]:
# Scaffold entries are the True class, client entries the False class.
x = np.array(list_scaffold + list_client)
idx = np.array(scaffold_id + client_id)
y = np.array([True] * len(list_scaffold) + [False] * len(list_client))

# Base learners: each one standardises the features and reduces them with PCA
# before its classifier.
nn_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=128),
    NN(lr=0.05),
)
rf_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=128),
    RandomForestClassifier(
        max_depth=5,
        max_features=None,
        class_weight="balanced",
        n_estimators=200,
        n_jobs=40,
    ),
)
svm_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=32),
    SVC(class_weight="balanced", probability=True, C=1, kernel="rbf", gamma="scale"),
)
hgboost_branch = make_pipeline(
    StandardScaler(),
    PCA(n_components=64),
    HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_leaf_nodes=15,
        min_samples_leaf=20,
        class_weight="balanced",
    ),
)
estimators = [
    ('nn', nn_branch),
    ('rf', rf_branch),
    ('svm', svm_branch),
    ('hgboost', hgboost_branch),
]

# Stack the base learners under a logistic-regression meta-learner, using a
# shuffled stratified 5-fold split inside the stacking classifier.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
    cv=cv,
)

In [None]:
# Cross-validate the stacked model and score the external predictors.
display = Display_auc(
    x=x,
    y=y,
    model=model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
display.run()

In [None]:
# Mean over the cross-validation folds of each metric of the stacked model.
fold_means = [
    mean(fold_scores)
    for fold_scores in (display.scores_rocauc, display.scores_prauc, display.scores_mcc)
]
print("ROC-AUC:{}, PR-AUC:{}, MCC:{}".format(*fold_means))

In [None]:
# Area under the fold-averaged ROC curve of each external predictor.
mean_fpr = np.linspace(0, 1, 1000)
mean_tpr_pdps = display.mean_tpr_pdps
mean_tpr_saps = display.mean_tpr_saps
mean_tpr_ps = display.mean_tpr_ps
for report_template, averaged_tpr in (
    ("PdPS ROC-AUC:{}", mean_tpr_pdps),
    ("SaPS ROC-AUC:{}", mean_tpr_saps),
    ("PS ROC-AUC:{}", mean_tpr_ps),
):
    print(report_template.format(auc(mean_fpr, averaged_tpr)))

In [None]:
# Area under the fold-averaged precision-recall curve of each external predictor.
mean_recall = np.linspace(0, 1, 1000)
mean_precision_pdps = display.mean_precision_pdps
mean_precision_saps = display.mean_precision_saps
mean_precision_ps = display.mean_precision_ps
for report_template, averaged_precision in (
    ("PdPS PR-AUC:{}", mean_precision_pdps),
    ("SaPS PR-AUC:{}", mean_precision_saps),
    ("PS PR-AUC:{}", mean_precision_ps),
):
    print(report_template.format(auc(mean_recall, averaged_precision)))

In [None]:
# Wilcoxon test on the per-fold PR-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_prauc in (
    ("PdPS", display.prauc_pdps),
    ("SaPS", display.prauc_saps),
    ("PScore", display.prauc_ps),
):
    w, p = wilcoxon(display.scores_prauc, rival_prauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# Wilcoxon test on the per-fold ROC-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_rocauc in (
    ("PdPS", display.rocauc_pdps),
    ("SaPS", display.rocauc_saps),
    ("PScore", display.rocauc_ps),
):
    w, p = wilcoxon(display.scores_rocauc, rival_rocauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# ROC figure: per-fold curves of the stacked model in gray, the fold-averaged
# curves of the external predictors, and the stacked model's mean curve drawn
# last so it sits on top.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

for fold_tpr in display.tprs:
    ax.plot(fpr, fold_tpr, color="gray", lw=1, alpha=0.3)


def _plot_mean_roc(legend_template, colour, mean_tpr, fold_aucs):
    """Draw one fold-averaged ROC curve labelled with mean and stdev of its AUCs."""
    ax.plot(
        fpr,
        mean_tpr,
        color=colour,
        label=legend_template % (mean(fold_aucs), stdev(fold_aucs)),
        lw=2,
        alpha=0.8,
    )


_plot_mean_roc(r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_tpr_pdps, display.rocauc_pdps)
_plot_mean_roc(r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_tpr_saps, display.rocauc_saps)
_plot_mean_roc(r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_tpr_ps, display.rocauc_ps)

# Chance diagonal (unlabelled, so it does not appear in the legend).
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])

_plot_mean_roc(r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)", "b", display.mean_tpr, display.scores_rocauc)

plt.title("Scaffold vs. Client", fontsize=20)
plt.xlabel("False positive rate", fontsize=13)
plt.ylabel("True positive rate", fontsize=13)

# Seq2Phase was plotted last (legend position 3); list it first.
handles, labels = ax.get_legend_handles_labels()
order = [3, 0, 1, 2]
ax.legend(
    [handles[pos] for pos in order],
    [labels[pos] for pos in order],
    loc="lower right",
    fontsize=10,
)
# File name follows the fig2c / fig2d / fig2f naming of the sibling panels
# (was "fige_roc.pdf").
plt.savefig("result/fig2e_roc.pdf")
plt.show()

In [None]:
# Precision-recall figure: raw per-fold curves of the stacked model in gray,
# the fold-averaged curves of the external predictors, and the stacked model's
# mean curve drawn last so it sits on top.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
mean_recall = np.linspace(0, 1, 1000)

for fold_recall, fold_precision in zip(display.recall, display.precisions):
    ax.plot(fold_recall, fold_precision, color="gray", lw=1, alpha=0.3)

# (legend template, colour, mean curve, per-fold AUCs), drawn in this order.
pr_series = (
    (r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_precision_pdps, display.prauc_pdps),
    (r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_precision_saps, display.prauc_saps),
    (r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_precision_ps, display.prauc_ps),
    (r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)", "b", display.mean_precision, display.scores_prauc),
)
for legend_template, colour, mean_precision, fold_aucs in pr_series:
    mean_auc = mean(fold_aucs)
    std_auc = stdev(fold_aucs)
    ax.plot(
        mean_recall,
        mean_precision,
        color=colour,
        label=legend_template % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

plt.xlabel("Recall", fontsize=13)
plt.ylabel("Precision", fontsize=13)
plt.title("Scaffold vs. Client", fontsize=20)

# Seq2Phase was plotted last (legend position 3); list it first.
handles, labels = ax.get_legend_handles_labels()
order = [3, 0, 1, 2]
ax.legend(
    [handles[pos] for pos in order],
    [labels[pos] for pos in order],
    loc="upper right",
    fontsize=10,
)
plt.savefig("result/fig2f_pr.pdf")
plt.show()

## Client vs. Non-LLPS

In [None]:
# Client entries are the True class, non-LLPS entries the False class.
x = np.array(list_client + list_nonllps)
idx = np.array(client_id + nonllps_id)
y = np.array([True] * len(list_client) + [False] * len(list_nonllps))

# Base learners.  Unlike the other comparisons no PCA is applied here, and
# only the SVM standardises its input.
nn_branch = NN(lr=0.01)
rf_branch = RandomForestClassifier(
    max_depth=20,
    max_features="sqrt",
    class_weight="balanced",
    n_estimators=200,
    n_jobs=40,
)
svm_branch = make_pipeline(
    StandardScaler(),
    SVC(class_weight="balanced", probability=True, gamma="auto"),
)
hgboost_branch = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_leaf_nodes=63,
    min_samples_leaf=80,
    class_weight="balanced",
)
estimators = [
    ('nn', nn_branch),
    ('rf', rf_branch),
    ('svm', svm_branch),
    ('hgboost', hgboost_branch),
]

# Stack the base learners under a logistic-regression meta-learner, using a
# shuffled stratified 5-fold split inside the stacking classifier.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1,
    cv=cv,
)

In [None]:
# Cross-validate the stacked model and score the external predictors.
display = Display_auc(
    x=x,
    y=y,
    model=model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
display.run()

In [None]:
# Mean over the cross-validation folds of each metric of the stacked model.
fold_means = [
    mean(fold_scores)
    for fold_scores in (display.scores_rocauc, display.scores_prauc, display.scores_mcc)
]
print("ROC-AUC:{}, PR-AUC:{}, MCC:{}".format(*fold_means))

In [None]:
# Area under the fold-averaged ROC curve of each external predictor.
mean_fpr = np.linspace(0, 1, 1000)
mean_tpr_pdps = display.mean_tpr_pdps
mean_tpr_saps = display.mean_tpr_saps
mean_tpr_ps = display.mean_tpr_ps
for report_template, averaged_tpr in (
    ("PdPS ROC-AUC:{}", mean_tpr_pdps),
    ("SaPS ROC-AUC:{}", mean_tpr_saps),
    ("PS ROC-AUC:{}", mean_tpr_ps),
):
    print(report_template.format(auc(mean_fpr, averaged_tpr)))

In [None]:
# Area under the fold-averaged precision-recall curve of each external predictor.
mean_recall = np.linspace(0, 1, 1000)
mean_precision_pdps = display.mean_precision_pdps
mean_precision_saps = display.mean_precision_saps
mean_precision_ps = display.mean_precision_ps
for report_template, averaged_precision in (
    ("PdPS PR-AUC:{}", mean_precision_pdps),
    ("SaPS PR-AUC:{}", mean_precision_saps),
    ("PS PR-AUC:{}", mean_precision_ps),
):
    print(report_template.format(auc(mean_recall, averaged_precision)))

In [None]:
# Wilcoxon test on the per-fold PR-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_prauc in (
    ("PdPS", display.prauc_pdps),
    ("SaPS", display.prauc_saps),
    ("PScore", display.prauc_ps),
):
    w, p = wilcoxon(display.scores_prauc, rival_prauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# Wilcoxon test on the per-fold ROC-AUC of the stacked model against each
# external predictor (values are paired by fold).
for rival, rival_rocauc in (
    ("PdPS", display.rocauc_pdps),
    ("SaPS", display.rocauc_saps),
    ("PScore", display.rocauc_ps),
):
    w, p = wilcoxon(display.scores_rocauc, rival_rocauc)
    print(f"W-statistic: {w}")
    print(f"vs. {rival}: P-value: {p}")

In [None]:
# ROC figure: per-fold curves of the stacked model in gray, the fold-averaged
# curves of the external predictors, and the stacked model's mean curve drawn
# last so it sits on top.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

for fold_tpr in display.tprs:
    ax.plot(fpr, fold_tpr, color="gray", lw=1, alpha=0.3)

# (legend template, colour, mean curve, per-fold AUCs) of the external predictors.
baseline_series = (
    (r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_tpr_pdps, display.rocauc_pdps),
    (r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_tpr_saps, display.rocauc_saps),
    (r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_tpr_ps, display.rocauc_ps),
)
for legend_template, colour, mean_tpr, fold_aucs in baseline_series:
    mean_auc = mean(fold_aucs)
    std_auc = stdev(fold_aucs)
    ax.plot(
        fpr,
        mean_tpr,
        color=colour,
        label=legend_template % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

# Chance diagonal (unlabelled, so it does not appear in the legend).
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])

mean_tpr = display.mean_tpr
mean_auc = mean(display.scores_rocauc)
std_auc = stdev(display.scores_rocauc)
ax.plot(
    fpr,
    mean_tpr,
    color="b",
    label=r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

plt.title("Client vs. Non-LLPS", fontsize=20)
plt.xlabel("False positive rate", fontsize=13)
plt.ylabel("True positive rate", fontsize=13)

# Seq2Phase was plotted last (legend position 3); list it first.
handles, labels = ax.get_legend_handles_labels()
order = [3, 0, 1, 2]
ax.legend(
    [handles[pos] for pos in order],
    [labels[pos] for pos in order],
    loc="lower right",
    fontsize=10,
)
plt.savefig("result/fig_S2a_roc.pdf")
plt.show()

In [None]:
# Precision-recall figure: raw per-fold curves of the stacked model in gray,
# the fold-averaged curves of the external predictors, and the stacked model's
# mean curve drawn last so it sits on top.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
mean_recall = np.linspace(0, 1, 1000)

for fold_recall, fold_precision in zip(display.recall, display.precisions):
    ax.plot(fold_recall, fold_precision, color="gray", lw=1, alpha=0.3)

# (legend template, colour, mean curve, per-fold AUCs), drawn in this order.
pr_series = (
    (r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)", "orangered", display.mean_precision_pdps, display.prauc_pdps),
    (r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)", "limegreen", display.mean_precision_saps, display.prauc_saps),
    (r"PScore (AUC = %0.3f $\pm$ %0.3f)", "gold", display.mean_precision_ps, display.prauc_ps),
    (r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)", "b", display.mean_precision, display.scores_prauc),
)
for legend_template, colour, mean_precision, fold_aucs in pr_series:
    mean_auc = mean(fold_aucs)
    std_auc = stdev(fold_aucs)
    ax.plot(
        mean_recall,
        mean_precision,
        color=colour,
        label=legend_template % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])

plt.xlabel("Recall", fontsize=13)
plt.ylabel("Precision", fontsize=13)
# Title spelled like the ROC panel of the same comparison (was "Non_LLPS").
plt.title("Client vs. Non-LLPS", fontsize=20)

# Seq2Phase was plotted last (legend position 3); list it first.
handles, labels = ax.get_legend_handles_labels()
order = [3, 0, 1, 2]
ax.legend(
    [handles[pos] for pos in order],
    [labels[pos] for pos in order],
    loc="upper right",
    fontsize=8,
)
plt.savefig("result/fig_S2b_pr.pdf")
plt.show()