In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA

In [None]:
# Record IDs of each protein class, read from the clustered FASTA files.
client_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta")
}
others_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}

# The .npy file holds a single pickled object; .item() unwraps it. It is used
# below as a mapping (presumably protein ID -> embedding vector; confirm).
# NOTE(review): allow_pickle=True can execute arbitrary code on load, so this
# must only ever be pointed at a trusted file.
mat = np.load("../fig2/PTT5XLU50_human.npy", allow_pickle=True).item()

list_client, list_others, list_scaffold = [], [], []
client_ids, scaffold_ids, nonllps_ids = [], [], []

# Route every entry to the bucket of its class. The checks run in the order
# non-LLPS, client, scaffold; entries belonging to none of them are skipped.
for k, vec in mat.items():
    if k in others_set:
        vectors, ids = list_others, nonllps_ids
    elif k in client_set:
        vectors, ids = list_client, client_ids
    elif k in scaffold_set:
        vectors, ids = list_scaffold, scaffold_ids
    else:
        continue
    vectors.append(vec)
    ids.append(k)

In [None]:
def under_sampling(x, y):
    """Balance a binary data set by drawing as many negatives as there are positives.

    Parameters
    ----------
    x : array-like
        Samples, indexed along the first axis by the masks derived from ``y``.
    y : array-like
        One label per sample. Entries equal to ``True`` are positives and
        entries equal to ``False`` are negatives; any other value is dropped.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_balanced, y_balanced)``: all positives followed by the drawn
        negatives.

    Raises
    ------
    ValueError
        If there are positives but no negatives to draw from.

    Notes
    -----
    Negatives are drawn with ``np.random.randint``, i.e. with replacement and
    from NumPy's global random state. Call ``np.random.seed`` beforehand for
    reproducible results; the same negative can be picked more than once.
    """
    # Coerce so that plain lists work too; ndarrays pass through unchanged.
    x = np.asarray(x)
    y = np.asarray(y)

    # Element-wise comparisons on purpose (not identity checks).
    positive_mask = y == True  # noqa: E712
    negative_mask = y == False  # noqa: E712

    x_true, y_true = x[positive_mask], y[positive_mask]
    x_false, y_false = x[negative_mask], y[negative_mask]
    positive_n = len(y_true)
    negative_n = len(y_false)

    if positive_n > 0 and negative_n == 0:
        raise ValueError(
            "under_sampling needs at least one negative sample to draw from"
        )

    # Same draw as before so seeded runs keep producing the same subsets.
    random_index = np.random.randint(0, negative_n, positive_n)
    x_false_u = x_false[random_index]
    y_false_u = y_false[random_index]
    return np.concatenate([x_true, x_false_u]), np.concatenate([y_true, y_false_u])

In [None]:
# Seed NumPy's global RNG, which under_sampling draws from.
np.random.seed(0)

# Client-vs-non-LLPS data set: clients are the positives (True),
# non-LLPS proteins the negatives (False); idx_all keeps the matching IDs.
x_all = np.array([*list_client, *list_others])
y_all = np.array(len(list_client) * [True] + len(list_others) * [False])
idx_all = np.array([*client_ids, *nonllps_ids])

# Standardise the features, then fit an SVM with probability estimates.
model_client = make_pipeline(
    StandardScaler(),
    SVC(class_weight="balanced", probability=True),
)

In [None]:
# Scaffold samples are scored by the model of every fold and averaged afterwards.
x_sca = np.array(list_scaffold)
idx_sca = np.array(scaffold_ids)

# Per-fold accumulators for IDs and client scores.
idx_cli, idx_non = [], []
cli_score_cli, cli_score_non, cli_score_sca = [], [], []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train, test in skf.split(x_all, y_all):
    # Fit on a class-balanced sample of the training part.
    x, y = under_sampling(x_all[train], y_all[train])
    model_client.fit(x, y)

    # Split the held-out part into positives and negatives.
    x_test, y_test, idx_test = x_all[test], y_all[test], idx_all[test]
    is_positive = y_test == True
    is_negative = y_test == False
    x_test_t, x_test_f = x_test[is_positive], x_test[is_negative]
    idx_test_t, idx_test_f = idx_test[is_positive], idx_test[is_negative]

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_client = model_client.predict_proba(x_test_t)[:, 1]
    y_pred_non = model_client.predict_proba(x_test_f)[:, 1]
    y_pred_sca = model_client.predict_proba(x_sca)[:, 1]

    cli_score_cli.append(y_pred_client)
    cli_score_non.append(y_pred_non)
    cli_score_sca.append(y_pred_sca)
    idx_cli.append(idx_test_t)
    idx_non.append(idx_test_f)

# Held-out scores are stitched together across folds; scaffold scores are
# the mean over the five fold models.
cli_score_cli = np.concatenate(cli_score_cli)
cli_score_non = np.concatenate(cli_score_non)
cli_score_sca = np.array(cli_score_sca).mean(axis=0)
idx_cli = np.concatenate(idx_cli)
idx_non = np.concatenate(idx_non)

# Re-key the scores by protein ID.
cli_score_cli = dict(zip(idx_cli, cli_score_cli))
cli_score_non = dict(zip(idx_non, cli_score_non))
cli_score_sca = dict(zip(idx_sca, cli_score_sca))

In [None]:
# Sanity check: number of scored proteins per class (client, non-LLPS, scaffold).
print(*map(len, (cli_score_cli, cli_score_non, cli_score_sca)))

In [None]:
# Scaffold-vs-non-LLPS data set: scaffolds are the positives (True),
# non-LLPS proteins the negatives (False). This rebinds x_all / y_all / idx_all.
x_all = np.array([*list_scaffold, *list_others])
y_all = np.array(len(list_scaffold) * [True] + len(list_others) * [False])
idx_all = np.array([*scaffold_ids, *nonllps_ids])

# Scale -> reduce to 32 principal components -> rescale -> SVM with probabilities.
model_scaffold = make_pipeline(
    StandardScaler(),
    PCA(n_components=32),
    StandardScaler(),
    SVC(class_weight="balanced", probability=True),
)

In [None]:
# Number of folds, shared by the client split and the stratified split below
# so the two can never get out of step.
n_folds = 5

# Shuffle the client indices once and cut them into n_folds parts: part i is
# the held-out client set of fold i, the remaining parts supply extra
# negatives for training in that fold.
np.random.seed(0)
x_cli = np.array(list_client)
idx_cli_ = np.array(client_ids)
indices = np.arange(len(list_client))
np.random.shuffle(indices)
indices_split = np.array_split(indices, n_folds)

# Re-seed so the sampling inside the loop starts from a known state.
np.random.seed(0)
idx_sca, idx_non, idx_cli = [], [], []
sca_score_cli, sca_score_non, sca_score_sca = [], [], []
skf = StratifiedKFold(n_splits=n_folds, random_state=0, shuffle=True)
for i, (train, test) in enumerate(skf.split(x_all, y_all)):
    # Clients held out for scoring in this fold.
    x_cli_test = x_cli[indices_split[i]]
    idx_cli_test = idx_cli_[indices_split[i]]
    print("# of cli_test", len(x_cli_test))

    # Clients from all other parts form the pool of extra negatives.
    indices_other_elements = np.concatenate(
        [part for j, part in enumerate(indices_split) if j != i]
    )
    x_cli_train = x_cli[indices_other_elements]
    print("# of cli_train", len(x_cli_train))

    # Balanced scaffold / non-LLPS sample, plus len(x)//2 clients drawn
    # without replacement and labelled as negatives.
    x, y = under_sampling(x_all[train], y_all[train])
    arr = np.random.choice(len(x_cli_train), len(x) // 2, replace=False)
    x_cli_selected = x_cli_train[arr]
    print("# of added client", len(arr))
    x = np.concatenate([x, x_cli_selected])
    y = np.concatenate([y, np.array([False] * len(x_cli_selected))])
    print("# of train", len(x))
    model_scaffold.fit(x, y)

    # Split the held-out part into positives and negatives.
    x_test = x_all[test]
    y_test = y_all[test]
    idx_test = idx_all[test]
    x_test_t = x_test[y_test == True]
    x_test_f = x_test[y_test == False]
    idx_test_t = idx_test[y_test == True]
    idx_test_f = idx_test[y_test == False]

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_sca = model_scaffold.predict_proba(x_test_t)[:, 1]
    y_pred_non = model_scaffold.predict_proba(x_test_f)[:, 1]
    y_pred_cli = model_scaffold.predict_proba(x_cli_test)[:, 1]
    sca_score_sca.append(y_pred_sca)
    sca_score_non.append(y_pred_non)
    sca_score_cli.append(y_pred_cli)
    idx_sca.append(idx_test_t)
    idx_non.append(idx_test_f)
    idx_cli.append(idx_cli_test)

# Every protein is scored once, by the fold model that did not train on it.
sca_score_sca = np.concatenate(sca_score_sca)
sca_score_non = np.concatenate(sca_score_non)
sca_score_cli = np.concatenate(sca_score_cli)
idx_sca = np.concatenate(idx_sca)
idx_non = np.concatenate(idx_non)
idx_cli = np.concatenate(idx_cli)

# Re-key the scores by protein ID.
sca_score_cli = {k: v for k, v in zip(idx_cli, sca_score_cli)}
sca_score_non = {k: v for k, v in zip(idx_non, sca_score_non)}
sca_score_sca = {k: v for k, v in zip(idx_sca, sca_score_sca)}

In [None]:
# Sanity check: number of scored proteins per class (client, non-LLPS, scaffold).
print(*map(len, (sca_score_cli, sca_score_non, sca_score_sca)))

In [None]:
def _score_frame(scaffold_scores, client_scores):
    """Build a frame indexed by the keys of ``scaffold_scores`` with both scores.

    The 'Scaffold' column holds the values of ``scaffold_scores``; the
    'Client' column is filled from ``client_scores`` by index alignment.
    """
    frame = pd.DataFrame(
        list(scaffold_scores.values()),
        index=scaffold_scores.keys(),
        columns=['Scaffold'],
    )
    frame['Client'] = pd.Series(client_scores)
    return frame


# One frame per protein class, each carrying the scaffold and the client score.
df_score_sca = _score_frame(sca_score_sca, cli_score_sca)
df_score_cli = _score_frame(sca_score_cli, cli_score_cli)
df_score_non = _score_frame(sca_score_non, cli_score_non)

In [None]:
# Tag every frame with its class name; the KDE plots use it as the hue.
for frame, class_name in (
    (df_score_sca, "Scaffold"),
    (df_score_cli, "Client"),
    (df_score_non, "Non-LLPS"),
):
    frame["Label"] = class_name

In [None]:
# Fig. 6a: client score vs. scaffold score, one point per protein.
fig = plt.figure(figsize=(6, 6))

# Plot order matters: groups plotted later are drawn over earlier ones.
for frame, colour, class_name, marker_size in (
    (df_score_non, "gray", "Non-LLPS", 0.5),
    (df_score_cli, "orange", "Client", 0.8),
    (df_score_sca, "b", "Scaffold", 2),
):
    plt.scatter(
        x=frame["Client"],
        y=frame["Scaffold"],
        c=colour,
        label=class_name,
        s=marker_size,
    )

plt.legend()
plt.xlabel("Client score")
plt.ylabel("Scaffold score")
plt.savefig("result/fig6a.pdf")
plt.show()

In [None]:
# Fig. 6b: bivariate density of (client score, scaffold score) per class.
fig = plt.figure(figsize=(6, 6))
df_all = pd.concat([df_score_non, df_score_cli, df_score_sca])

# common_norm=False normalises each class density on its own.
ax = sns.kdeplot(
    x=df_all.Client,
    y=df_all.Scaffold,
    hue=df_all.Label,
    common_norm=False,
    palette={"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"},
    label=["Non-LLPS", "Client", "Scaffold"],
)
sns.move_legend(ax, "upper left")

ax.set_xlabel("Client score")
ax.set_ylabel("Scaffold score")
ax.set_xlim([-0.19, 1.199])
ax.set_ylim([-0.19, 1.199])
for side in ("right", "top"):
    ax.spines[side].set_visible(False)

plt.savefig("result/fig6b.pdf")
plt.show()

In [None]:
# Marginal density of the client score per class (a wide, flat strip).
fig = plt.figure(figsize=(5, .8))
sns.kdeplot(
    x=df_all.Client,
    hue=df_all.Label,
    palette={"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"},
    common_norm=False,
    fill=True,
    legend=False,
)

# Strip everything except the bottom axis.
plt.xlabel("")
plt.ylabel("")
for side in ("right", "top", "left"):
    plt.gca().spines[side].set_visible(False)
plt.yticks([])

plt.savefig("result/fig6bx.pdf")
plt.show()

In [None]:
# Marginal density of the scaffold score per class (a tall, narrow strip).
fig = plt.figure(figsize=(.8, 5))
sns.kdeplot(
    y=df_all.Scaffold,
    hue=df_all.Label,
    palette={"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"},
    common_norm=False,
    fill=True,
    legend=False,
)

# Strip everything except the left axis.
plt.xlabel("")
plt.ylabel("")
for side in ("right", "top", "bottom"):
    plt.gca().spines[side].set_visible(False)
plt.xticks([])

plt.savefig("result/fig6by.pdf")
plt.show()