In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA
from scipy import stats

In [None]:
# Extract IDs of human "Regulator" entries from the DrLLPS table via a shell pipeline:
#   tail -n +2           -> start output at line 2 (drops the first line, presumably a header)
#   grep "Homo sapiens"  -> keep lines containing that text
#   grep "Regulator"     -> of those, keep lines containing "Regulator"
#   cut -f 2             -> keep only the second tab-separated field
# NOTE(review): both greps match anywhere on the line rather than in a specific
# column -- confirm that cannot pick up unintended rows.
regulator_ids = !tail -n +2 ../fig1/data/DrLLPS_230423.txt |grep "Homo sapiens" | grep "Regulator" | cut -f 2
# De-duplicate; later this set is compared with the second "|"-separated field of the embedding keys.
regulator_set = set(regulator_ids)
# Show how many distinct regulator IDs were found.
len(regulator_set)

In [None]:
# Record IDs of each protein class, read from the per-class human FASTA files.
client_set = {
    record.id
    for record in SeqIO.parse("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    record.id
    for record in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}
others_set = {
    record.id
    for record in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}

# .item() unwraps the single Python object stored in the .npy file; below it is
# used as a mapping from record ID to feature vector (the file name suggests
# ProtT5-XL-U50 embeddings -- TODO confirm).
# NOTE: allow_pickle=True can execute arbitrary code; only load trusted files.
mat = np.load("../fig2/PTT5XLU50_human.npy", allow_pickle=True).item()

# Parallel lists per class: feature vectors and the IDs they belong to.
list_client, client_id = [], []
list_others, others_id = [], []
list_scaffold, scaffold_id = [], []
list_regulator, regulator_id = [], []

# Each key goes to the first class that claims it, in this precedence order:
# non-LLPS, client, scaffold, regulator. Keys matching none are ignored.
for k, vec in mat.items():
    if k in others_set:
        vec_bucket, id_bucket = list_others, others_id
    elif k in client_set:
        vec_bucket, id_bucket = list_client, client_id
    elif k in scaffold_set:
        vec_bucket, id_bucket = list_scaffold, scaffold_id
    elif k.split("|")[1] in regulator_set:
        # Regulator IDs are matched against the second "|"-separated field of the key.
        vec_bucket, id_bucket = list_regulator, regulator_id
    else:
        continue
    vec_bucket.append(vec)
    id_bucket.append(k)

In [None]:
def under_sampling(x, y, idx):
    """Balance a binary dataset by randomly down-sampling the negative class.

    Parameters
    ----------
    x : array-like
        Samples, selected along the first axis.
    y : array-like of bool
        Labels aligned with ``x``; ``True`` marks positives, ``False`` negatives.
    idx : array-like
        Identifiers aligned with ``x`` and ``y``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_balanced, y_balanced, idx_balanced)``: all positives first, followed
        by as many randomly chosen negatives as there are positives.

    Raises
    ------
    ValueError
        If there are positives but no negatives to draw from.

    Notes
    -----
    Sampling uses NumPy's global random state, so a preceding
    ``np.random.seed(...)`` makes the draw repeatable.

    Negatives are drawn *without* replacement whenever enough of them exist, so
    no negative appears twice in the result (duplicates could otherwise end up on
    both sides of a later train/test split). Only when negatives are fewer than
    positives does the draw fall back to sampling with replacement, which keeps
    the output size equal to twice the number of positives.
    """
    x, y, idx = np.asarray(x), np.asarray(y), np.asarray(idx)

    pos_mask = y == True  # noqa: E712 -- element-wise comparison on an array
    neg_mask = y == False  # noqa: E712

    x_true, x_false = x[pos_mask], x[neg_mask]
    y_true, y_false = y[pos_mask], y[neg_mask]
    idx_true, idx_false = idx[pos_mask], idx[neg_mask]

    positive_n = len(y_true)
    negative_n = len(y_false)
    if positive_n > 0 and negative_n == 0:
        raise ValueError("under_sampling needs at least one negative sample to draw from")

    # Replacement is only needed when the negatives cannot cover the positives.
    random_index = np.random.choice(
        negative_n, size=positive_n, replace=positive_n > negative_n
    )

    x_false_u = x_false[random_index]
    y_false_u = y_false[random_index]
    idx_false_u = idx_false[random_index]
    return (
        np.concatenate([x_true, x_false_u]),
        np.concatenate([y_true, y_false_u]),
        np.concatenate([idx_true, idx_false_u]),
    )

In [None]:
# Seed NumPy's global RNG so the random under-sampling draw below is repeatable.
np.random.seed(0)

# Binary task: clients are the positive class (True), non-LLPS proteins the negative class (False).
x_all=np.array(list_client+list_others)
y_all=np.array([True]*len(list_client) + [False]*len(list_others))
idx_all=np.array(client_id+others_id)

# Down-sample the negatives to the number of positives.
x,y,idx=under_sampling(x_all,y_all,idx_all)

# probability=True makes SVC calibrate probabilities with an internal, randomly
# shuffled cross-validation. Without random_state that shuffle takes its seed
# from the global RNG, so predict_proba changed whenever the fit cell was
# re-run; pinning random_state keeps the scores reproducible on every fit.
model=make_pipeline(
    StandardScaler(),
    SVC(class_weight="balanced", probability=True, random_state=0),
)

In [None]:
# Only the first of the five stratified splits is used: the model is trained on
# its training part and evaluated on data outside that part.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
train, test = next(iter(skf.split(x, y)))
model.fit(x[train], y[train])

id_train = idx[train]
x_test, y_test, idx_test = x[test], y[test], idx[test]

# Held-out positives (clients) from the test part of the split.
is_positive = y_test == True
x_test_t = x_test[is_positive]
idx_test_t = idx_test[is_positive]

# Negatives for evaluation: every non-LLPS protein that has a feature vector
# and was not part of the training split.
non_no_train = others_set - set(id_train)
idx_no_tran_f = [key for key in mat.keys() if key in non_no_train]
x_no_train_f = np.array([mat[key] for key in idx_no_tran_f])

y_test_t_pred = model.predict_proba(x_test_t)
y_no_train_f_pred = model.predict_proba(x_no_train_f)

In [None]:
# Score regulators and scaffolds with the client-vs-non-LLPS model.
x_reg = np.array(list_regulator)
x_sca = np.array(list_scaffold)
y_reg = model.predict_proba(x_reg)
y_sca = model.predict_proba(x_sca)

In [None]:
# Two-sided Mann-Whitney U tests of the regulator scores against, in order:
# held-out clients, non-training non-LLPS proteins, and scaffolds.
# Column 1 of predict_proba is the probability of the positive (True) class.
reg_scores = y_reg[:, 1]
for other_proba in (y_test_t_pred, y_no_train_f_pred, y_sca):
    print(stats.mannwhitneyu(reg_scores, other_proba[:, 1], alternative='two-sided'))

In [None]:
# One single-column frame per group (column 0 = positive-class probability),
# indexed by the corresponding record IDs.
df_cli, df_non, df_reg, df_sca = (
    pd.DataFrame(proba[:, 1], index=ids)
    for proba, ids in (
        (y_test_t_pred, idx_test_t),
        (y_no_train_f_pred, idx_no_tran_f),
        (y_reg, regulator_id),
        (y_sca, scaffold_id),
    )
)

In [None]:
# Inspect the scores the client model assigned to scaffold proteins.
# NOTE(review): this renders the whole frame; df_sca.head() would keep the output short.
df_sca

In [None]:
# Tag every frame with its group name, then stack them; the concatenation order
# (Regulator, Client, Scaffold, Non-LLPS) is the order the groups appear in the table.
for frame, group_name in (
    (df_cli, "Client"),
    (df_non, "Non-LLPS"),
    (df_reg, "Regulator"),
    (df_sca, "Scaffold"),
):
    frame["Label"] = group_name
df = pd.concat([df_reg, df_cli, df_sca, df_non])

In [None]:
# Set the palette before the axes exist so the new axes pick up these colors.
sns.set_palette(["tomato", "orange", "mediumpurple", "lightgray"])
fig, ax = plt.subplots(figsize=(6, 4))
# cut=0 stops each violin at the observed min/max instead of extending past the data.
sns.violinplot(data=df, y=0, x="Label", cut=0, ax=ax)
ax.set_ylabel("Client score")
ax.set_xlabel("")
ax.set_ylim([-0.03, 1.15])
fig.savefig("result/fig6c.pdf")
plt.show()

In [None]:
# Seed NumPy's global RNG so the random under-sampling draw below is repeatable.
np.random.seed(0)

# Binary task: scaffolds are the positive class (True), non-LLPS proteins the negative class (False).
x_all=np.array(list_scaffold+list_others)
y_all=np.array([True]*len(list_scaffold) + [False]*len(list_others))
idx_all=np.array(scaffold_id+others_id)

# Down-sample the negatives to the number of positives.
x,y,idx=under_sampling(x_all,y_all,idx_all)

# probability=True makes SVC calibrate probabilities with an internal, randomly
# shuffled cross-validation. Without random_state that shuffle takes its seed
# from the global RNG, so predict_proba changed whenever the fit cell was
# re-run; pinning random_state keeps the scores reproducible on every fit.
model_sca=make_pipeline(
    StandardScaler(),
    SVC(class_weight="balanced", probability=True, random_state=0),
)

In [None]:
# Only the first of the five stratified splits is used: the scaffold model is
# trained on its training part and evaluated on data outside that part.
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
train, test = next(iter(skf.split(x, y)))
model_sca.fit(x[train], y[train])

id_train = idx[train]
x_test, y_test, idx_test = x[test], y[test], idx[test]

# Held-out positives (scaffolds) from the test part of the split.
is_positive = y_test == True
x_test_t = x_test[is_positive]
idx_test_t = idx_test[is_positive]

# Negatives for evaluation: every non-LLPS protein that has a feature vector
# and was not part of the training split.
non_no_train = others_set - set(id_train)
idx_no_tran_f = [key for key in mat.keys() if key in non_no_train]
x_no_train_f = np.array([mat[key] for key in idx_no_tran_f])

y_test_t_pred = model_sca.predict_proba(x_test_t)
y_no_train_f_pred = model_sca.predict_proba(x_no_train_f)

In [None]:
# Score regulators and clients with the scaffold-vs-non-LLPS model.
x_reg = np.array(list_regulator)
x_cli = np.array(list_client)
y_reg = model_sca.predict_proba(x_reg)
y_cli = model_sca.predict_proba(x_cli)

In [None]:
# Two-sided Mann-Whitney U tests of the regulator scores against, in order:
# held-out scaffolds, non-training non-LLPS proteins, and clients.
# Column 1 of predict_proba is the probability of the positive (True) class.
reg_scores = y_reg[:, 1]
for other_proba in (y_test_t_pred, y_no_train_f_pred, y_cli):
    print(stats.mannwhitneyu(reg_scores, other_proba[:, 1], alternative='two-sided'))

In [None]:
# One single-column frame per group (column 0 = positive-class probability from
# the scaffold model), indexed by the corresponding record IDs.
df_sca, df_non, df_reg, df_cli = (
    pd.DataFrame(proba[:, 1], index=ids)
    for proba, ids in (
        (y_test_t_pred, idx_test_t),
        (y_no_train_f_pred, idx_no_tran_f),
        (y_reg, regulator_id),
        (y_cli, client_id),
    )
)

In [None]:
# Tag every frame with its group name, then stack them; the concatenation order
# (Regulator, Client, Scaffold, Non-LLPS) is the order the groups appear in the table.
for frame, group_name in (
    (df_cli, "Client"),
    (df_non, "Non-LLPS"),
    (df_reg, "Regulator"),
    (df_sca, "Scaffold"),
):
    frame["Label"] = group_name
df_sca_score = pd.concat([df_reg, df_cli, df_sca, df_non])

In [None]:
# Set the palette here as well (same colors as the fig6c cell) so this figure no
# longer depends on global seaborn state left behind by an earlier cell. It must
# be set before the axes are created so they pick up these colors.
sns.set_palette(["tomato", "orange", "mediumpurple", "lightgray"])
fig, ax = plt.subplots(figsize=(6, 4))
# cut=0 stops each violin at the observed min/max instead of extending past the data.
sns.violinplot(data=df_sca_score, y=0, x="Label", cut=0, ax=ax)
ax.set_ylabel("Scaffold score")
ax.set_xlabel("")
ax.set_ylim([-0.03, 1.15])
fig.savefig("result/fig6d.pdf")
plt.show()