## Dependencies
- [PScore](https://doi.org/10.7554/eLife.31486)
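- [PhaSePred](http://predict.phasep.pro): download the precomputed human predictions from [here](http://predict.phasep.pro/static/phasepred/database/human_reviewed.zip); this notebook reads `human_reviewed.json` from the working directory.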

In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, make_scorer
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Sequence IDs of each category, read from the clustered FASTA files
# (order of reads: client, scaffold, non-LLPS).
client_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}
others_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}

# The .npy file is loaded as a pickled object and unwrapped with .item();
# it is used below as a mapping {sequence id: feature vector}.
mat = np.load("PTT5XLU50_human.npy", allow_pickle=True).item()

# Per-category feature vectors and the matching IDs (second "|"-separated
# field of the sequence id), kept in the same order.
list_client, list_others, list_scaffold = [], [], []
client_id, others_id, scaffold_id = [], [], []
for key, embedding in mat.items():
    # Membership precedence is non-LLPS, then client, then scaffold;
    # keys found in none of the three sets are ignored.
    if key in others_set:
        vectors, accessions = list_others, others_id
    elif key in client_set:
        vectors, accessions = list_client, client_id
    elif key in scaffold_set:
        vectors, accessions = list_scaffold, scaffold_id
    else:
        continue
    vectors.append(embedding)
    accessions.append(key.split("|")[1])

In [None]:
class Display_auc():
    """Repeated, under-sampled 5-fold cross-validation of a classifier.

    For every fold the classifier's ROC curve is compared with the ROC
    curves obtained from three precomputed per-protein scores (``pdps``,
    ``saps`` and ``pscore``), restricted to the test samples for which a
    score is available.

    After ``run()`` the results are available as lists with one entry per
    under-sampling repeat: ``tprs``, ``rocaucs``, ``praucs``, ``mccs``,
    ``tprs_pdps``, ``tprs_saps`` and ``tprs_ps``.
    """

    def __init__(self,x:np.array, y:np.array, model, saps:dict, pdps:dict, pscore:dict, idx:np.array):
        """Store the data set, the estimator and the baseline score tables.

        x      -- feature matrix, one row per sample.
        y      -- labels, compared against True/False in under_sampling().
        model  -- estimator providing fit / predict / predict_proba.
        saps, pdps -- dicts looked up with the entries of ``idx``.
        pscore -- dict whose keys are "|"-delimited; it is re-keyed by the
                  second field so it can be looked up with ``idx`` entries.
        idx    -- per-sample identifiers, aligned with ``x`` and ``y``.
        """
        self.x=x
        self.y=y
        self.model=model
        self.saps=saps
        self.pdps=pdps
        self.pscore={k.split("|")[1]:v for k, v in pscore.items()}
        self.idx=idx

    def _baseline_tpr(self, scores, test, y, idx, grid):
        """Return the ROC curve of a precomputed score on one test fold, interpolated onto ``grid``."""
        # Only test samples that actually have a baseline score take part.
        covered = [i for i in test if idx[i] in scores]
        labels = [y[i] for i in covered]
        preds = [scores[idx[i]] for i in covered]
        fpr, tpr, _ = roc_curve(labels, preds)
        interp_tpr = np.interp(grid, fpr, tpr)
        interp_tpr[0] = 0.0  # pin the curve to the origin
        return interp_tpr

    def cv(self, x, y, idx):
        """Run one stratified 5-fold cross-validation on (x, y, idx).

        Returns a 7-tuple:
        (mean model TPR curve, mean ROC-AUC, mean PR-AUC, mean MCC,
         mean PdPS TPR curve, mean SaPS TPR curve, mean PScore TPR curve),
        where every TPR curve is sampled on 1000 evenly spaced FPR values
        in [0, 1] and averaged over the folds.
        """
        splitter=StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        scores_rocauc=[]
        scores_prauc=[]
        scores_mcc=[]
        tprs = []
        tprs_pdps = []
        tprs_saps = []
        tprs_ps = []
        mean_fpr = np.linspace(0, 1, 1000)

        for train,test in splitter.split(x, y):
            self.model.fit(x[train],y[train])
            # Use the estimator owned by this instance; the previous code
            # read a notebook-global ``model`` here.
            y_pred_proba=self.model.predict_proba(x[test])[:,1]
            y_pred=self.model.predict(x[test])

            fpr, tpr, _ = roc_curve(y[test], y_pred_proba)
            scores_rocauc.append(auc(fpr,tpr))
            precision, recall, _ = precision_recall_curve(y[test], y_pred_proba)
            scores_prauc.append(auc(recall, precision))
            scores_mcc.append(matthews_corrcoef(y[test], y_pred))

            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)

            # Baselines evaluated on the same test fold.
            tprs_pdps.append(self._baseline_tpr(self.pdps, test, y, idx, mean_fpr))
            tprs_saps.append(self._baseline_tpr(self.saps, test, y, idx, mean_fpr))
            tprs_ps.append(self._baseline_tpr(self.pscore, test, y, idx, mean_fpr))

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr_pdps = np.mean(tprs_pdps, axis=0)
        mean_tpr_saps = np.mean(tprs_saps, axis=0)
        mean_tpr_ps = np.mean(tprs_ps, axis=0)
        # Force every averaged curve to end at TPR = 1.
        mean_tpr[-1] = 1.0
        mean_tpr_pdps[-1] = 1.0
        mean_tpr_saps[-1] = 1.0
        mean_tpr_ps[-1] = 1.0
        return mean_tpr, mean(scores_rocauc), mean(scores_prauc), mean(scores_mcc), mean_tpr_pdps, mean_tpr_saps, mean_tpr_ps

    def under_sampling(self):
        """Return (x, y, idx) with all positives followed by an equal number of sampled negatives.

        Negatives are drawn with NumPy's global random state, so the result
        depends on the current seed.
        """
        x=self.x
        y=self.y
        idx=self.idx
        pos_mask = y==True
        neg_mask = y==False
        x_true, y_true, idx_true = x[pos_mask], y[pos_mask], idx[pos_mask]
        x_false, y_false, idx_false = x[neg_mask], y[neg_mask], idx[neg_mask]
        positive_n=len(y_true)
        negative_n=len(y_false)
        # NOTE(review): randint draws indices independently, so the same
        # negative can be picked more than once and may then appear in both
        # the train and the test split of cv(). Kept as-is so that results
        # stay reproducible with the existing seed; consider
        # np.random.choice(..., replace=False) if that is not intended.
        random_index=np.random.randint(0,negative_n,positive_n)
        x_false_u=x_false[random_index]
        y_false_u=y_false[random_index]
        idx_false_u=idx_false[random_index]
        return np.concatenate([x_true, x_false_u]), np.concatenate([y_true, y_false_u]), np.concatenate([idx_true, idx_false_u])

    def run(self):
        """Perform 10 under-sampling repeats, each followed by cv(), and collect the results.

        Resets NumPy's global random seed to 0 before the first repeat and
        prints a progress line after each one.
        """
        self.tprs=[]
        self.rocaucs=[]
        self.praucs=[]
        self.mccs=[]
        self.tprs_pdps=[]
        self.tprs_saps=[]
        self.tprs_ps=[]
        np.random.seed(seed=0)
        for i in range(10):
            x,y,idx=self.under_sampling()
            tpr, rocauc, prauc, mcc, tpr_pdps, tpr_saps, tpr_ps = self.cv(x,y,idx)
            self.tprs.append(tpr)
            self.rocaucs.append(rocauc)
            self.praucs.append(prauc)
            self.mccs.append(mcc)
            self.tprs_pdps.append(tpr_pdps)
            self.tprs_saps.append(tpr_saps)
            self.tprs_ps.append(tpr_ps)
            print("{}/10 Done".format(i+1))

In [None]:
def _read_pscore(path):
    """Parse a whitespace-delimited PScore result file into a dict.

    For every non-blank line the second column is stored as a float under
    the third column with its first character removed.
    (presumably that character is a FASTA ">" marker - verify against the
    PScore output format.)
    """
    scores = {}
    with open(path, "r") as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank or whitespace-only line.
                continue
            scores[words[2][1:]] = float(words[1])
    return scores


# PhaSePred predictions: keep the "PhaSePred" row and pull the two
# 10-feature scores out of each entry.
df=pd.read_json("human_reviewed.json") #PhaSePred
df=df.loc["PhaSePred",:]
pdps10={k:df[k]['PdPS-10fea'] for k in df.index}
saps10={k:df[k]['SaPS-10fea'] for k in df.index}

# PScore results for the three protein categories, merged into one table.
cli_dict=_read_pscore("../fig1/result/human_client_pscore.txt")
oth_dict=_read_pscore("../fig1/result/human_nonllps_pscore.txt")
sca_dict=_read_pscore("../fig1/result/human_scaffold_pscore.txt")
pscore=cli_dict|oth_dict|sca_dict

In [None]:
# Task: client proteins (label True) against non-LLPS proteins (label False).
n_pos, n_neg = len(list_client), len(list_others)
x = np.array(list_client + list_others)
y = np.array([True] * n_pos + [False] * n_neg)
idx = np.array(client_id + others_id)

# Standardised features fed to a class-balanced SVC with probability output.
svc = SVC(class_weight="balanced", probability=True)
model = make_pipeline(StandardScaler(), svc)

In [None]:
# Evaluate the model and the three baseline scores on the current task.
svm_display = Display_auc(
    x,
    y,
    model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
svm_display.run()

In [None]:
# ROC figure for the client vs. non-LLPS task.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

# One thin gray trace per under-sampling repeat of the model.
for run_tpr in svm_display.tprs:
    ax.plot(fpr, run_tpr, color="gray", lw=1, alpha=0.3)

# Model curve: average over repeats; the AUC statistics come from the
# per-repeat ROC-AUC values collected during cross-validation.
mean_tpr = np.mean(svm_display.tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = mean(svm_display.rocaucs)
std_auc = stdev(svm_display.rocaucs)
ax.plot(
    fpr,
    mean_tpr,
    color="b",
    label=r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Baseline curves: the AUC of each repeat is integrated from its
# interpolated TPR curve.
baselines = [
    (svm_display.tprs_pdps, "orangered", r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_saps, "limegreen", r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_ps, "gold", r"PScore (AUC = %0.3f $\pm$ %0.3f)"),
]
for curves, color, label_fmt in baselines:
    aucs = [auc(fpr, tpr) for tpr in curves]
    mean_tpr = np.mean(curves, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = mean(aucs)
    std_auc = stdev(aucs)
    ax.plot(
        fpr,
        mean_tpr,
        color=color,
        label=label_fmt % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

# Chance diagonal, axes cosmetics, export.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Client vs. Non-LLPS", fontsize=20)
ax.set_xlabel("False positive rate", fontsize=13)
ax.set_ylabel("True positive rate", fontsize=13)
ax.legend(loc="lower right", fontsize=12)
plt.savefig("result/figS2.pdf")
plt.show()

In [None]:
# Mean ± sample standard deviation over the repeats, printed in the order
# ROC-AUC, PR-AUC, MCC.
for metric_values in (svm_display.rocaucs, svm_display.praucs, svm_display.mccs):
    print("{:.3f}±{:.3f}".format(mean(metric_values), stdev(metric_values)))

In [None]:
# Task: scaffold proteins (label True) against non-LLPS proteins (label False).
n_pos, n_neg = len(list_scaffold), len(list_others)
x = np.array(list_scaffold + list_others)
y = np.array([True] * n_pos + [False] * n_neg)
idx = np.array(scaffold_id + others_id)

# Standardise, reduce to 32 principal components, standardise again, then
# classify with a class-balanced SVC with probability output.
steps = [
    StandardScaler(),
    PCA(n_components=32),
    StandardScaler(),
    SVC(class_weight="balanced", probability=True),
]
model = make_pipeline(*steps)

In [None]:
# Evaluate the model and the three baseline scores on the current task.
svm_display = Display_auc(
    x,
    y,
    model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
svm_display.run()

In [None]:
# ROC figure for the scaffold vs. non-LLPS task.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

# One thin gray trace per under-sampling repeat of the model.
for run_tpr in svm_display.tprs:
    ax.plot(fpr, run_tpr, color="gray", lw=1, alpha=0.3)

# Model curve: average over repeats; the AUC statistics come from the
# per-repeat ROC-AUC values collected during cross-validation.
mean_tpr = np.mean(svm_display.tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = mean(svm_display.rocaucs)
std_auc = stdev(svm_display.rocaucs)
ax.plot(
    fpr,
    mean_tpr,
    color="b",
    label=r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Baseline curves: the AUC of each repeat is integrated from its
# interpolated TPR curve.
baselines = [
    (svm_display.tprs_pdps, "orangered", r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_saps, "limegreen", r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_ps, "gold", r"PScore (AUC = %0.3f $\pm$ %0.3f)"),
]
for curves, color, label_fmt in baselines:
    aucs = [auc(fpr, tpr) for tpr in curves]
    mean_tpr = np.mean(curves, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = mean(aucs)
    std_auc = stdev(aucs)
    ax.plot(
        fpr,
        mean_tpr,
        color=color,
        label=label_fmt % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

# Chance diagonal, axes cosmetics, export.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Scaffold vs. Non-LLPS", fontsize=20)
ax.set_xlabel("False positive rate", fontsize=13)
ax.set_ylabel("True positive rate", fontsize=13)
ax.legend(loc="lower right", fontsize=12)
plt.savefig("result/fig2b.pdf")
plt.show()

In [None]:
# Mean ± sample standard deviation over the repeats, printed in the order
# ROC-AUC, PR-AUC, MCC.
for metric_values in (svm_display.rocaucs, svm_display.praucs, svm_display.mccs):
    print("{:.3f}±{:.3f}".format(mean(metric_values), stdev(metric_values)))

In [None]:
# Task: scaffold proteins (label True) against client proteins (label False).
n_pos, n_neg = len(list_scaffold), len(list_client)
x = np.array(list_scaffold + list_client)
y = np.array([True] * n_pos + [False] * n_neg)
idx = np.array(scaffold_id + client_id)

# Standardise, reduce to 32 principal components, standardise again, then
# classify with a class-balanced SVC with probability output.
steps = [
    StandardScaler(),
    PCA(n_components=32),
    StandardScaler(),
    SVC(class_weight="balanced", probability=True),
]
model = make_pipeline(*steps)

In [None]:
# Evaluate the model and the three baseline scores on the current task.
svm_display = Display_auc(
    x,
    y,
    model,
    saps=saps10,
    pdps=pdps10,
    pscore=pscore,
    idx=idx,
)
svm_display.run()

In [None]:
# ROC figure for the scaffold vs. client task.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)
fpr = np.linspace(0, 1, 1000)

# One thin gray trace per under-sampling repeat of the model.
for run_tpr in svm_display.tprs:
    ax.plot(fpr, run_tpr, color="gray", lw=1, alpha=0.3)

# Model curve: average over repeats; the AUC statistics come from the
# per-repeat ROC-AUC values collected during cross-validation.
mean_tpr = np.mean(svm_display.tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = mean(svm_display.rocaucs)
std_auc = stdev(svm_display.rocaucs)
ax.plot(
    fpr,
    mean_tpr,
    color="b",
    label=r"Seq2Phase (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Baseline curves: the AUC of each repeat is integrated from its
# interpolated TPR curve.
baselines = [
    (svm_display.tprs_pdps, "orangered", r"PhaSePred-PdPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_saps, "limegreen", r"PhaSePred-SaPS (AUC = %0.3f $\pm$ %0.3f)"),
    (svm_display.tprs_ps, "gold", r"PScore (AUC = %0.3f $\pm$ %0.3f)"),
]
for curves, color, label_fmt in baselines:
    aucs = [auc(fpr, tpr) for tpr in curves]
    mean_tpr = np.mean(curves, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = mean(aucs)
    std_auc = stdev(aucs)
    ax.plot(
        fpr,
        mean_tpr,
        color=color,
        label=label_fmt % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

# Chance diagonal, axes cosmetics, export.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", alpha=0.8)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
plt.title("Scaffold vs. Client", fontsize=20)
# Axis labels were missing here although the two sibling ROC figures set
# them; added for consistency.
ax.set_xlabel("False positive rate", fontsize=13)
ax.set_ylabel("True positive rate", fontsize=13)
ax.legend(loc="lower right", fontsize=12)
plt.savefig("result/fig2c.pdf")
plt.show()