In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

In [None]:
# Record ids of the two protein classes, read from their FASTA files.
others_set = {
    record.id
    for record in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    record.id
    for record in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}

# `.item()` unwraps the loaded array into the mapping used below
# (presumably {record id: embedding vector} -- TODO confirm).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this
# file from a trusted source.
mat = np.load("PTT5XLU50_human.npy", allow_pickle=True).item()

list_others, list_scaffold = [], []
scaffold_ids, nonllps_ids = [], []
for key, embedding in mat.items():
    # An id present in both sets is treated as "others" (first branch wins).
    if key in others_set:
        list_others.append(embedding)
        # Second "|"-separated field of the id; assumes ids look like
        # "db|ACCESSION|NAME" -- TODO confirm.
        nonllps_ids.append(key.split("|")[1])
    elif key in scaffold_set:
        list_scaffold.append(embedding)
        scaffold_ids.append(key.split("|")[1])

In [None]:
def under_sampling(x, y):
    """Balance a binary dataset by randomly sub-sampling the negative class.

    Parameters
    ----------
    x : numpy.ndarray
        Feature rows, indexed along axis 0 in step with ``y``.
    y : numpy.ndarray
        Labels; entries equal to ``True`` are positives, entries equal to
        ``False`` are negatives.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(x_balanced, y_balanced)``: every positive row followed by as many
        randomly chosen negative rows as there are positives.

    Notes
    -----
    Negatives are drawn from NumPy's global RNG (seed it with
    ``np.random.seed``).  They are drawn *without* replacement so that no
    negative row appears twice in the result; a duplicated row could otherwise
    end up in both a training and a test fold when the result is
    cross-validated.  Only when there are fewer negatives than positives does
    the draw fall back to sampling with replacement.
    """
    # `== True` / `== False` are element-wise NumPy comparisons, not identity tests.
    positive_mask = (y == True)
    negative_mask = (y == False)
    x_true, y_true = x[positive_mask], y[positive_mask]
    x_false, y_false = x[negative_mask], y[negative_mask]

    positive_n = len(y_true)
    negative_n = len(y_false)

    # Replacement is unavoidable only if the negative pool is too small.
    with_replacement = positive_n > negative_n
    random_index = np.random.choice(negative_n, size=positive_n, replace=with_replacement)

    return (
        np.concatenate([x_true, x_false[random_index]]),
        np.concatenate([y_true, y_false[random_index]]),
    )

In [None]:
# Stack the scaffold (positive) embeddings first, then the "others" (negative)
# embeddings, and build the matching boolean label vector in the same order.
all_embeddings = list_scaffold + list_others
all_labels = [True] * len(list_scaffold)
all_labels.extend([False] * len(list_others))

x_all = np.array(all_embeddings)
y_all = np.array(all_labels)

In [None]:
# Baseline: SVM on the full-dimensional features.  The negative class is
# under-sampled `n_repeats` times; each balanced dataset is scored with
# 5-fold cross-validated ROC AUC and the per-repeat means are summarised.
n_repeats = 10
np.random.seed(0)  # seeds NumPy's global RNG, which under_sampling draws from
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)  # reused by later cells
scores = []
for repeat in range(n_repeats):
    x, y = under_sampling(x_all, y_all)
    model = make_pipeline(StandardScaler(), SVC(class_weight="balanced", probability=True))
    fold_aucs = []
    for train, test in skf.split(x, y):
        model.fit(x[train], y[train])
        # Deliberately not named `auc`: that would overwrite the `auc`
        # function imported from sklearn.metrics.
        fold_auc = roc_auc_score(y[test], model.predict_proba(x[test])[:, 1])
        fold_aucs.append(fold_auc)
    scores.append(mean(fold_aucs))
print("AUC:{:.3f}±{:.3f}".format(mean(scores), stdev(scores)))

In [None]:
# PCA: standardise and project onto the leading principal components (both
# fitted on the training fold only), then train/evaluate the SVM pipeline on
# the projected features.  Collects mean and SD of the ROC AUC per dimensionality.
dims = [64, 32, 16, 8, 4, 2, 1]
n_repeats = 10
pca_mean_auc = []
pca_sd_auc = []
for n_dim in dims:
    np.random.seed(0)  # restart NumPy's global RNG for every dimensionality
    scores = []
    for repeat in range(n_repeats):
        x, y = under_sampling(x_all, y_all)
        # NOTE(review): this pipeline starts with its own StandardScaler, so the
        # projected components are standardised again before the SVM -- confirm
        # that this is intended.
        model = make_pipeline(StandardScaler(), SVC(class_weight="balanced", probability=True))
        fold_aucs = []
        for train, test in skf.split(x, y):
            pca = PCA(n_components=n_dim)
            sc = StandardScaler()
            x_reduced_train = pca.fit_transform(sc.fit_transform(x[train]))
            x_reduced_test = pca.transform(sc.transform(x[test]))

            model.fit(x_reduced_train, y[train])
            # Deliberately not named `auc`: that would overwrite the `auc`
            # function imported from sklearn.metrics.
            fold_auc = roc_auc_score(y[test], model.predict_proba(x_reduced_test)[:, 1])
            fold_aucs.append(fold_auc)
        scores.append(mean(fold_aucs))
    mean_auc = mean(scores)
    sd_auc = stdev(scores)
    pca_mean_auc.append(mean_auc)
    pca_sd_auc.append(sd_auc)
    print("n_components:{}, AUC:{:.3f}±{:.3f}".format(n_dim, mean_auc, sd_auc))

In [None]:
# Kernel PCA (RBF kernel): same protocol as the linear-PCA experiment, with the
# projection replaced by KernelPCA fitted on the training fold only.
dims = [64, 32, 16, 8, 4, 2, 1]
n_repeats = 10
gamma = 0.0001  # RBF kernel coefficient passed to KernelPCA
kpca_mean_auc = []
kpca_sd_auc = []
for n_dim in dims:
    np.random.seed(0)  # restart NumPy's global RNG for every dimensionality
    scores = []
    for repeat in range(n_repeats):
        x, y = under_sampling(x_all, y_all)
        model = make_pipeline(StandardScaler(), SVC(class_weight="balanced", probability=True))
        fold_aucs = []
        for train, test in skf.split(x, y):
            # n_jobs=-1 uses however many cores the machine has instead of a
            # fixed worker count tied to one particular server.
            kpca = KernelPCA(n_components=n_dim, kernel="rbf", gamma=gamma, n_jobs=-1)
            sc = StandardScaler()
            x_reduced_train = kpca.fit_transform(sc.fit_transform(x[train]))
            x_reduced_test = kpca.transform(sc.transform(x[test]))

            model.fit(x_reduced_train, y[train])
            # Deliberately not named `auc`: that would overwrite the `auc`
            # function imported from sklearn.metrics.
            fold_auc = roc_auc_score(y[test], model.predict_proba(x_reduced_test)[:, 1])
            fold_aucs.append(fold_auc)
        scores.append(mean(fold_aucs))
    mean_auc = mean(scores)
    sd_auc = stdev(scores)
    kpca_mean_auc.append(mean_auc)
    kpca_sd_auc.append(sd_auc)
    print("n_components:{}, AUC:{:.3f}±{:.3f}".format(n_dim, mean_auc, sd_auc))

In [None]:
class FeatureSelectorCC:
    """Keep the features most strongly correlated with the target.

    ``fit`` ranks every column of ``x`` by the absolute value of its
    correlation coefficient (pandas default method) with ``y`` and remembers
    the ``n_components`` highest-ranked column labels; ``transform`` returns
    just those columns.
    """

    def __init__(self, n_components):
        # Number of columns to keep.
        self.n_components = n_components
        # Column labels chosen by fit(); None until fit() has been called.
        self.selected_columns = None

    def fit(self, x, y):
        """Choose the columns of ``x`` with the largest |correlation| to ``y``.

        Parameters
        ----------
        x : array-like, 2-D
            Feature matrix (rows are samples).
        y : array-like, 1-D
            Target with one entry per row of ``x``; converted to float.

        Returns
        -------
        FeatureSelectorCC
            ``self``, so calls can be chained.
        """
        df = pd.DataFrame(x)
        target = pd.Series(np.asarray(y, dtype=float), index=df.index)
        # Correlate each column with the target directly instead of building
        # the full column-by-column correlation matrix and reading one column.
        correlations = df.corrwith(target).abs()
        self.selected_columns = correlations.nlargest(self.n_components).index.tolist()
        return self

    def transform(self, x):
        """Return the columns of ``x`` selected by ``fit`` as a NumPy array."""
        df = pd.DataFrame(x)
        return df[self.selected_columns].values

In [None]:
# Correlation-based feature selection: keep the columns chosen by
# FeatureSelectorCC (fitted on the training fold only), then train/evaluate the
# SVM pipeline on the selected columns.
dims = [64, 32, 16, 8, 4, 2, 1]
n_repeats = 10
corr_mean_auc = []
corr_sd_auc = []
for n_dim in dims:
    np.random.seed(0)  # restart NumPy's global RNG for every dimensionality
    scores = []
    for repeat in range(n_repeats):
        x, y = under_sampling(x_all, y_all)
        model = make_pipeline(StandardScaler(), SVC(class_weight="balanced", probability=True))
        fold_aucs = []
        for train, test in skf.split(x, y):
            fs = FeatureSelectorCC(n_components=n_dim)
            fs.fit(x[train], y[train])
            x_reduced_train = fs.transform(x[train])
            x_reduced_test = fs.transform(x[test])

            model.fit(x_reduced_train, y[train])
            # Deliberately not named `auc`: that would overwrite the `auc`
            # function imported from sklearn.metrics.
            fold_auc = roc_auc_score(y[test], model.predict_proba(x_reduced_test)[:, 1])
            fold_aucs.append(fold_auc)
        scores.append(mean(fold_aucs))
    mean_auc = mean(scores)
    sd_auc = stdev(scores)
    corr_mean_auc.append(mean_auc)
    corr_sd_auc.append(sd_auc)
    print("n_components:{}, AUC:{:.3f}±{:.3f}".format(n_dim, mean_auc, sd_auc))

In [None]:
from sklearn.feature_selection import mutual_info_regression

class FeatureSelectorMI:
    """Keep the features sharing the most mutual information with a class label.

    ``fit`` scores every column of ``x`` against the discrete target ``y`` and
    remembers the indices of the ``n_components`` highest-scoring columns;
    ``transform`` returns just those columns.
    """

    def __init__(self, n_components):
        # Number of columns to keep.
        self.n_components = n_components
        # Integer column indices chosen by fit(); None until fit() has been called.
        self.selected_columns = None

    def fit(self, x, y):
        """Choose the columns of ``x`` with the highest mutual information to ``y``.

        Parameters
        ----------
        x : numpy.ndarray, 2-D
            Feature matrix (rows are samples).
        y : array-like, 1-D
            Discrete class label for each row of ``x``.

        Returns
        -------
        FeatureSelectorMI
            ``self``, so calls can be chained.
        """
        # The target is a class label, so the classification estimator is the
        # appropriate one (the regression variant treats y as continuous).
        # Imported locally so this class does not rely on an import made in
        # another cell.
        from sklearn.feature_selection import mutual_info_classif

        mutual_infos = mutual_info_classif(x, y)
        ranked = np.argsort(mutual_infos)  # ascending: best columns are last
        # Clamp to [0, n_features]; a plain [-n:] slice would return *every*
        # column for n == 0.
        n_keep = min(max(int(self.n_components), 0), len(ranked))
        self.selected_columns = ranked[len(ranked) - n_keep:]
        return self

    def transform(self, x):
        """Return the columns of ``x`` selected by ``fit``."""
        return x[:, self.selected_columns]

In [None]:
# Mutual-information feature selection: keep the columns chosen by
# FeatureSelectorMI (fitted on the training fold only), then train/evaluate the
# SVM pipeline on the selected columns.
dims = [64, 32, 16, 8, 4, 2, 1]
n_repeats = 10
mi_mean_auc = []
mi_sd_auc = []
for n_dim in dims:
    np.random.seed(0)  # restart NumPy's global RNG for every dimensionality
    scores = []
    for repeat in range(n_repeats):
        x, y = under_sampling(x_all, y_all)
        model = make_pipeline(StandardScaler(), SVC(class_weight="balanced", probability=True))
        fold_aucs = []
        for train, test in skf.split(x, y):
            mi = FeatureSelectorMI(n_components=n_dim)
            mi.fit(x[train], y[train])
            x_reduced_train = mi.transform(x[train])
            x_reduced_test = mi.transform(x[test])

            model.fit(x_reduced_train, y[train])
            # Deliberately not named `auc`: that would overwrite the `auc`
            # function imported from sklearn.metrics.
            fold_auc = roc_auc_score(y[test], model.predict_proba(x_reduced_test)[:, 1])
            fold_aucs.append(fold_auc)
        scores.append(mean(fold_aucs))
    mean_auc = mean(scores)
    sd_auc = stdev(scores)
    mi_mean_auc.append(mean_auc)
    mi_sd_auc.append(sd_auc)
    print("n_components:{}, AUC:{:.3f}±{:.3f}".format(n_dim, mean_auc, sd_auc))

In [None]:
# Figure S3: mean ROC AUC versus number of retained dimensions for each
# reduction/selection method, with a dashed reference line.
dims = [64, 32, 16, 8, 4, 2, 1]  # must match the order used when the *_mean_auc lists were filled
positions = range(len(dims))     # evenly spaced x positions, labelled with `dims` below

# NOTE(review): 0.895 is hard-coded -- presumably the AUC printed by the
# full-dimensional baseline cell; re-check it whenever that cell is re-run.
plt.axhline(y=0.895, linestyle='dashed', color='grey')

curves = [
    (pca_mean_auc, '-o', 'PCA'),
    (kpca_mean_auc, '-^', 'KPCA'),
    (corr_mean_auc, '-x', 'CC'),
    (mi_mean_auc, '-+', 'MI'),
]
for values, fmt, label in curves:
    plt.plot(positions, values, fmt, label=label)

plt.ylabel("ROC AUC", fontsize=14)
plt.xlabel("#Dimensions", fontsize=14)  # label typo ("Dimentions") corrected
plt.xticks(positions, dims)
plt.legend(fontsize=14)
plt.savefig("result/figS3.pdf")  # the result/ directory must already exist
plt.show()