In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA

In [None]:
# Record ids of each class, read from the clustered drllps_* FASTA files.
client_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}
others_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}

# The .npy file stores a pickled dict (unwrapped with .item()); its keys are
# compared against the FASTA record ids below.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files you produced or otherwise trust.
mat = np.load("../fig2/PTT5XLU50_human.npy", allow_pickle=True).item()

# Vectors and their ids, split by class.
list_client, client_id = [], []
list_others, others_id = [], []
list_scaffold, scaffold_id = [], []
for k, vec in mat.items():
    # An id found in several sets is assigned by precedence: others > client > scaffold.
    if k in others_set:
        list_others.append(vec)
        others_id.append(k)
    elif k in client_set:
        list_client.append(vec)
        client_id.append(k)
    elif k in scaffold_set:
        list_scaffold.append(vec)
        scaffold_id.append(k)

In [None]:
def under_sampling(x, y, idx):
    """Balance a binary dataset by drawing as many negatives as there are positives.

    Every sample whose label equals True is kept. The same number of samples
    whose label equals False is then drawn at random with
    ``np.random.randint`` (global NumPy RNG), i.e. *with replacement*, so one
    negative sample can be selected more than once.

    Parameters
    ----------
    x, y, idx : numpy arrays
        Features, labels and sample identifiers; all three are indexed with
        the same masks built from ``y``.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced, idx_balanced)``, each holding the positive
        samples first, followed by the drawn negative samples.
    """
    pos_mask = y == True
    neg_mask = y == False

    n_pos = len(y[pos_mask])
    n_neg = len(y[neg_mask])

    # One shared set of draws keeps x, y and idx aligned with each other.
    picks = np.random.randint(0, n_neg, n_pos)

    balanced = []
    for arr in (x, y, idx):
        kept_pos = arr[pos_mask]
        drawn_neg = arr[neg_mask][picks]
        balanced.append(np.concatenate([kept_pos, drawn_neg]))
    return tuple(balanced)

In [None]:
# Seed the global NumPy RNG before anything random happens: under_sampling()
# draws from it, so the order of these statements matters.
np.random.seed(0)

# Client entries are the positive class (True), "others" the negative class (False).
n_client, n_others = len(list_client), len(list_others)
x_all = np.array([*list_client, *list_others])
y_all = np.array(n_client * [True] + n_others * [False])
idx_all = np.array([*client_id, *others_id])

# Balanced subset actually used for training / evaluation.
x, y, idx = under_sampling(x_all, y_all, idx_all)

# Standardise the features, then fit an SVM that can output class probabilities.
model = make_pipeline(
    StandardScaler(),
    SVC(class_weight="balanced", probability=True),
)

In [None]:
# Only the first of the five stratified folds is used: the model is fitted on
# its training part and no further fold is ever drawn from the splitter.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
train, test = next(iter(skf.split(x, y)))
model.fit(x[train], y[train])

id_train = idx[train]
x_test, y_test, idx_test = x[test], y[test], idx[test]

# Positive samples of the held-out part of the fold.
test_pos_mask = y_test == True
x_test_t = x_test[test_pos_mask]
idx_test_t = idx_test[test_pos_mask]

# Every "others" id that was not part of the training split, collected in the
# iteration order of `mat`.
non_no_train = others_set - set(id_train)
idx_no_tran_f = [k for k in mat.keys() if k in non_no_train]
x_no_train_f = np.array([mat[k] for k in idx_no_tran_f])

# Class probabilities for the held-out positives and the unseen "others".
y_test_t_pred = model.predict_proba(x_test_t)
y_no_train_f_pred = model.predict_proba(x_no_train_f)

In [None]:
# Rank both groups by column 1 of predict_proba, highest value first.
# NOTE(review): column 1 should correspond to the True label (second entry of
# model.classes_) — confirm against the fitted model.
f_scores = y_no_train_f_pred[:, 1]
t_scores = y_test_t_pred[:, 1]

# Ascending argsort, then reversed, so ties keep the same order as before.
f_pred_sorted_idx = np.flip(np.argsort(f_scores))
t_pred_sorted_idx = np.flip(np.argsort(t_scores))

# Ids reordered to match the ranking.
non_id_sorted = np.asarray(idx_no_tran_f)[f_pred_sorted_idx]
cli_id_sorted = idx_test_t[t_pred_sorted_idx]

In [None]:
# Reduce each id to its second "|"-separated field (presumably the accession of
# a "db|ACCESSION|NAME" style header — TODO confirm).
# The names are rebound in place because later cells read them, so the cell has
# to be idempotent: an id without "|" is treated as already reduced and passed
# through unchanged. Without this guard a second run of the cell raised
# IndexError on the already-stripped ids.
non_id_sorted = [i.split("|")[1] if "|" in i else i for i in non_id_sorted]
cli_id_sorted = [i.split("|")[1] if "|" in i else i for i in cli_id_sorted]

In [None]:
# First ten entries of non_id_sorted, i.e. the "others" ids that rank highest by predicted probability.
non_id_sorted[:10]

In [None]:
# First ten entries of cli_id_sorted, i.e. the held-out positive ids that rank highest by predicted probability.
cli_id_sorted[:10]

In [None]:
# Second embedding file; each value is later passed to sliding_window_avg(),
# which requires a 2-D array (presumably one row per residue — TODO confirm).
# It is loaded under its own name so that the dict bound to `mat` in the data
# loading cell stays available; rebinding and deleting `mat` here made the
# cross-validation cell impossible to re-run afterwards.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files you produced or otherwise trust.
mat_aa = np.load("data/PTT5XLU50_human_aa.npy", allow_pickle=True).item()

# Top three ids of each ranking.
client_like_client_set = set(cli_id_sorted[:3])
client_like_nonllps_set = set(non_id_sorted[:3])

list_client_mat = []
list_non_mat = []
client_like_client = []
client_like_nonllps = []
for k, residue_mat in mat_aa.items():
    # Keys are matched on their second "|"-separated field, the same field the
    # ranked id lists were reduced to.
    k_id = k.split("|")[1]
    if k_id in client_like_client_set:
        list_client_mat.append(residue_mat)
        client_like_client.append(k_id)
    elif k_id in client_like_nonllps_set:
        list_non_mat.append(residue_mat)
        client_like_nonllps.append(k_id)

# Only the selected matrices are needed from here on; drop the full dict.
del mat_aa

In [None]:
def sliding_window_avg(arr, window_size):
    """Smooth a 2-D array along axis 0 with a centred moving average.

    Row ``i`` of the result is the mean of rows
    ``i - window_size // 2 .. i + window_size // 2`` of ``arr``, clipped to the
    array bounds, so rows near either end are averaged over fewer rows.
    Because the window is centred, an even ``window_size`` effectively spans
    ``window_size + 1`` rows (e.g. 100 -> up to 101 rows).

    Parameters
    ----------
    arr : numpy.ndarray
        Two-dimensional input; averaging runs over the first axis.
    window_size : int
        Nominal window length; must be at least 1 and no larger than
        ``arr.shape[0]``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``arr``. Floating-point (and complex) input
        keeps its dtype; any other dtype (e.g. integers) is returned as
        float64 so the averages are not truncated.

    Raises
    ------
    ValueError
        If ``arr`` is not 2-D, or ``window_size`` is smaller than 1 or larger
        than the number of rows.
    """
    if len(arr.shape) != 2:
        raise ValueError("Input must be a two-dimensional array")

    # A negative window would yield empty slices and NaN means, and zero would
    # silently behave as a one-row window; reject both.
    if window_size < 1:
        raise ValueError("Window size must be a positive integer")

    if window_size > arr.shape[0]:
        raise ValueError("Window size cannot be larger than the array length")

    # Keep the dtype of inexact input (unchanged behaviour), but never store
    # means in an integer buffer, which would silently drop the fraction.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.inexact) else np.float64
    result = np.zeros_like(arr, dtype=out_dtype)

    n_rows = arr.shape[0]
    pad_size = window_size // 2
    for i in range(n_rows):
        start = max(0, i - pad_size)
        end = min(n_rows, i + pad_size + 1)
        result[i] = np.mean(arr[start:end], axis=0)
    return result

In [None]:
# Smoothed score profile for each selected client entry.
# Every profile gets its own explicit figure, which is closed once saved and
# shown. Drawing on pyplot's implicit "current" figure let curves of successive
# entries pile up on one axes whenever the backend does not close the figure
# on show().
y_arrs = {}
for residue_mat, protein_id in zip(list_client_mat, client_like_client):
    # Average along the first axis with a nominal window of 100, then score
    # every row with the fitted model; column 1 of predict_proba is kept.
    x_arr = sliding_window_avg(residue_mat, 100)
    y_arr = model.predict_proba(x_arr)
    y_arrs[protein_id] = y_arr[:, 1]

    fig, ax = plt.subplots()
    ax.axhline(y=0.5, linestyle='dashed', color='grey')  # reference line at 0.5
    ax.plot(y_arr[:, 1])
    ax.set_ylim([0.2, 0.8])
    ax.set_ylabel("Client score")
    ax.set_xlabel("Sequence")
    ax.set_title(protein_id)
    fig.savefig("result/fig7_" + protein_id + ".pdf")
    plt.show()
    plt.close(fig)

In [None]:
# Smoothed score profile for each selected "others" (non-LLPS) entry; results
# are added to the y_arrs dict created by the previous plotting cell.
# Every profile gets its own explicit figure, which is closed once saved and
# shown. Drawing on pyplot's implicit "current" figure let curves of successive
# entries pile up on one axes whenever the backend does not close the figure
# on show().
for residue_mat, protein_id in zip(list_non_mat, client_like_nonllps):
    # Average along the first axis with a nominal window of 100, then score
    # every row with the fitted model; column 1 of predict_proba is kept.
    x_arr = sliding_window_avg(residue_mat, 100)
    y_arr = model.predict_proba(x_arr)
    y_arrs[protein_id] = y_arr[:, 1]

    fig, ax = plt.subplots()
    ax.axhline(y=0.5, linestyle='dashed', color='grey')  # reference line at 0.5
    ax.plot(y_arr[:, 1])
    ax.set_ylim([0.2, 0.8])
    ax.set_ylabel("Client score")
    ax.set_xlabel("Sequence")
    ax.set_title(protein_id)
    fig.savefig("result/fig7_" + protein_id + ".pdf")
    plt.show()
    plt.close(fig)

In [None]:
def _format_runs(positions):
    """Collapse ascending integer positions into a "+"-joined run string.

    Consecutive positions are merged into runs. A run of several positions is
    written "first-last"; a run of one position is written "n+n" (the position
    repeated), which is the format this cell has always printed. Runs are
    joined with "+". An empty input yields an empty string.

    Example: [3, 4, 5, 9, 12, 13] -> "3-5+9+9+12-13"
    """
    runs = []
    for pos in positions:
        if runs and pos == runs[-1][1] + 1:
            runs[-1][1] = pos  # extend the current run
        else:
            runs.append([pos, pos])  # start a new run
    return "+".join(
        f"{first}+{last}" if first == last else f"{first}-{last}"
        for first, last in runs
    )


# For every scored entry, print its id followed by the 1-based positions whose
# score is at least 0.5, collapsed into runs.
for k in y_arrs:
    # flatnonzero gives 0-based positions; +1 converts them to 1-based.
    indices_1 = (np.flatnonzero(y_arrs[k] >= 0.5) + 1).tolist()
    # An entry with no position at or above 0.5 now prints an empty line
    # instead of raising IndexError.
    s = _format_runs(indices_1)
    print(k)
    print(s)