In [None]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, make_scorer
from statistics import stdev, variance, mean
from sklearn.decomposition import PCA
import torch
from torch import nn, optim
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
class EarlyStopping:
    """Track a validation loss and signal when training should stop.

    Call the instance once per epoch with the current validation loss and the
    model. A strictly lower loss than the best one seen so far saves
    ``model.state_dict()`` to ``path`` and resets the patience counter. Any
    other call increments the counter; once it reaches ``patience`` the
    ``early_stop`` flag is set to True (it is never reset by this class).

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is raised.
        verbose: When True, print a message on every counter increment and on
            every checkpoint.
        path: File the best ``state_dict`` is written to with ``torch.save``.
    """

    def __init__(self, patience=5, verbose=False, path='checkpoint_model.pth'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; the lowercase name exists in every release.
        self.val_loss_min = np.inf
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation loss and update counter / checkpoint / stop flag."""
        # Higher score is better, so the loss is negated.
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we got is the best so far.
            self.best_score = score
            self.checkpoint(val_loss, model)
        elif score <= self.best_score:
            # No strict improvement (ties count as "no improvement").
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.checkpoint(val_loss, model)
            self.counter = 0

    def checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss
        
def training_loop(n_epochs, optimizer, model, loss, mask_train, x_train,  y_train):
    """Train ``model`` full-batch with early stopping on a random 20% hold-out.

    The samples are shuffled with ``torch.randperm``; the last
    ``int(0.2 * n_samples)`` shuffled indices form the validation split and the
    rest are used for the gradient steps. Each epoch computes the training
    loss, evaluates the validation loss in eval mode, consults
    ``EarlyStopping`` (patience 10) and, unless it asks to stop, performs one
    optimizer step. The model is updated in place; nothing is returned.

    NOTE(review): the checkpoint written by ``EarlyStopping`` is not loaded
    back here, so the model keeps the weights of the last executed step.

    Args:
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model: Callable module mapping ``x_train`` to predictions.
        loss: Callable ``loss(pred, target)`` returning a scalar tensor.
        mask_train: Unused; kept so existing callers keep working.
        x_train: Feature tensor, indexed along its first dimension.
        y_train: Target tensor aligned with ``x_train``.
    """
    n_samples = x_train.shape[0]
    n_val = int(n_samples * 0.2)
    n_fit = n_samples - n_val

    shuffled_ind = torch.randperm(n_samples)

    # Split at a positive index: with n_val == 0 the former `[:-n_val]` slice
    # was empty (and `[-n_val:]` was everything), so tiny inputs trained on
    # nothing and produced NaN losses.
    train_ind = shuffled_ind[:n_fit]
    val_ind = shuffled_ind[n_fit:]

    x_val, y_val = x_train[val_ind], y_train[val_ind]
    x_train, y_train = x_train[train_ind], y_train[train_ind]

    # Without validation samples there is nothing to early-stop on.
    has_val = n_val > 0

    patience = 10
    earlystopping = EarlyStopping(patience=patience, verbose=False) if has_val else None
    for epoch in range(1, n_epochs + 1):
        model.train()

        y_train_pred = model(x_train)
        loss_train = loss(y_train_pred, y_train)

        if has_val:
            model.eval()
            with torch.no_grad():
                y_val_pred = model(x_val)
                loss_val = loss(y_val_pred, y_val)

            # The check happens before the step, so the epoch that triggers
            # the stop does not update the weights.
            earlystopping(loss_val, model)
            if earlystopping.early_stop:
                break

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        
class FNN2(nn.Module):
    """Small feed-forward network with one hidden layer and two outputs.

    Layer stack: Linear(embeddings_dim -> 32), Dropout(dropout), ReLU,
    Linear(32 -> 2). The forward pass returns the output of the last linear
    layer as-is (no softmax is applied here).

    Args:
        embeddings_dim: Size of the last dimension of the input.
        dropout: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, embeddings_dim=1024, dropout=0.25):
        super().__init__()

        hidden_units = 32
        layers = [
            nn.Linear(embeddings_dim, hidden_units),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_units, 2),
        ]
        # The attribute name and layer positions determine the state_dict keys
        # ("linear.0.*" and "linear.3.*").
        self.linear = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
        """Run ``x`` through the layer stack; extra keyword arguments are ignored."""
        return self.linear(x)
    

class NN(ClassifierMixin, BaseEstimator):
    """scikit-learn style binary classifier backed by the ``FNN2`` network.

    ``fit`` builds a fresh ``FNN2`` sized to the input width, optimises it with
    Adam through ``training_loop`` and stores the sorted class labels in
    ``classes_``. ``predict`` returns labels taken from ``classes_``;
    ``predict_proba`` returns softmax probabilities whose columns follow
    ``classes_``.

    The mixin is listed before ``BaseEstimator`` as scikit-learn's developer
    guide requires, so the estimator is recognised as a classifier.

    Args:
        n_epochs: Maximum number of training epochs.
        lr: Adam learning rate.
    """

    def __init__(self, n_epochs=500, lr=0.03):
        self.n_epochs = n_epochs
        self.lr = lr
        self.model = None
        self.optim = None
        self.loss = nn.CrossEntropyLoss()

    def fit(self, X, y):
        """Train a new network on ``X`` / ``y`` and return ``self``.

        Raises:
            ValueError: If ``y`` does not contain exactly two distinct labels
                (``FNN2`` has exactly two outputs).
        """
        # Encode labels as positions in classes_ so arbitrary binary labels
        # (bool, strings, {1, 2}, ...) become valid CrossEntropyLoss targets.
        self.classes_, y_index = np.unique(y, return_inverse=True)
        if len(self.classes_) != 2:
            raise ValueError(
                f"NN supports exactly 2 classes, got {len(self.classes_)}"
            )
        X_tensor = torch.tensor(X, dtype=torch.float)
        y_tensor = torch.tensor(y_index.reshape(-1), dtype=torch.long)
        n_dim = X_tensor.shape[1]
        self.model = FNN2(embeddings_dim=n_dim)
        self.optim = optim.Adam(self.model.parameters(), lr=self.lr)
        training_loop(
            n_epochs=self.n_epochs,
            optimizer=self.optim,
            model=self.model,
            loss=self.loss,
            mask_train=None,
            x_train=X_tensor,
            y_train=y_tensor,
        )
        return self

    def predict(self, X):
        """Return the predicted class label (an entry of ``classes_``) per row of ``X``."""
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.float)
            self.model.eval()
            y_pred = self.model(X_tensor)
            _, predicted = torch.max(y_pred, 1)
        # Map argmax positions back to the original labels instead of
        # returning raw 0/1 indices.
        return self.classes_[predicted.numpy()]

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of softmax probabilities."""
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.float)
            self.model.eval()
            y_pred = self.model(X_tensor)
            probas = nn.Softmax(dim=1)(y_pred)
            return probas.numpy()

In [None]:
# Record IDs contained in each FASTA file (only `rec.id` is kept).
client_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_client_clstr_Homo_sapiens.fasta", "fasta")
}
others_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_nonllps_clstr_Homo_sapiens.fasta", "fasta")
}
scaffold_set = {
    rec.id
    for rec in SeqIO.parse("../fig1/result/drllps_scaffold_clstr_Homo_sapiens.fasta", "fasta")
}

# The .npy file wraps one pickled object; `.item()` unwraps it. Below it is
# used as a mapping from ID to the value stored for that ID.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files from a trusted source.
mat = np.load("../fig2/embedding/PTT5XLU50_human.npy", allow_pickle=True).item()

list_client, list_others, list_scaffold = [], [], []
client_ids, scaffold_ids, nonllps_ids = [], [], []

# Sets are tested in this order, so an ID present in several sets is assigned
# to the first one that matches; IDs in none of the sets are skipped.
buckets = (
    (others_set, list_others, nonllps_ids),
    (client_set, list_client, client_ids),
    (scaffold_set, list_scaffold, scaffold_ids),
)
for prot_id in mat.keys():
    for id_set, vectors, ids in buckets:
        if prot_id in id_set:
            vectors.append(mat[prot_id])
            ids.append(prot_id)
            break

In [None]:
np.random.seed(0)

# Client (True) vs non-LLPS (False) data set; the three arrays share one row order.
x_all = np.array(list_client + list_others)
y_all = np.array([True] * len(list_client) + [False] * len(list_others))
idx_all = np.array(client_ids + nonllps_ids)

# Explicit random_state on every scikit-learn estimator that accepts one: the
# stacking classifier fits its members through joblib (n_jobs=-1), and the
# global NumPy seed set above is not guaranteed to reach those workers.
# NN takes no seed argument, so its initialisation and split stay unseeded here.
estimators = [
    ('nn', NN(lr=0.01)),
    ('rf', RandomForestClassifier(max_depth=20, max_features="sqrt", class_weight="balanced",
                                  n_estimators=200, n_jobs=40, random_state=0)),
    ('svm', make_pipeline(StandardScaler(),
                          SVC(class_weight="balanced", probability=True, gamma="auto", random_state=0))),
    ('hgboost', HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=63, min_samples_leaf=80,
                                               class_weight="balanced", random_state=0)),
]

# Inner CV used by the stacking classifier to build the meta-features.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model_client = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(class_weight="balanced"), n_jobs=-1, cv=cv
)

In [None]:
# Scaffold inputs are scored by every fold's model and averaged afterwards;
# client / non-LLPS inputs are scored only by the fold that held them out.
x_sca = np.array(list_scaffold)
idx_sca = np.array(scaffold_ids)

idx_cli, idx_non = [], []
cli_score_cli, cli_score_non, cli_score_sca = [], [], []

skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
for train, test in skf.split(x_all, y_all):
    model_client.fit(x_all[train], y_all[train])

    x_test, y_test, idx_test = x_all[test], y_all[test], idx_all[test]
    # Explicit comparisons so the result is always a boolean mask.
    held_out_pos = y_test == True
    held_out_neg = y_test == False

    # Column 1 of predict_proba is the probability of the second class.
    y_pred_client = model_client.predict_proba(x_test[held_out_pos])[:, 1]
    y_pred_non = model_client.predict_proba(x_test[held_out_neg])[:, 1]
    y_pred_sca = model_client.predict_proba(x_sca)[:, 1]

    cli_score_cli.append(y_pred_client)
    cli_score_non.append(y_pred_non)
    cli_score_sca.append(y_pred_sca)
    idx_cli.append(idx_test[held_out_pos])
    idx_non.append(idx_test[held_out_neg])

# Flatten the per-fold chunks; IDs and scores were appended in the same order.
idx_cli = np.concatenate(idx_cli)
idx_non = np.concatenate(idx_non)
cli_score_cli = np.concatenate(cli_score_cli)
cli_score_non = np.concatenate(cli_score_non)
# One row per fold -> mean over folds for each scaffold input.
cli_score_sca = np.mean(np.array(cli_score_sca), axis=0)

# Final form: ID -> client score.
cli_score_cli = dict(zip(idx_cli, cli_score_cli))
cli_score_non = dict(zip(idx_non, cli_score_non))
cli_score_sca = dict(zip(idx_sca, cli_score_sca))

In [None]:
# Number of scored IDs per group (client, non-LLPS, scaffold).
print(*map(len, (cli_score_cli, cli_score_non, cli_score_sca)))

In [None]:
# Scaffold (True) vs everything else (False). Rows are ordered scaffold,
# non-LLPS, client; the two extra masks remember which negatives are which.
# NOTE: this rebinds x_all / y_all / idx_all used by the client model cells.
n_scaffold, n_others, n_client = len(list_scaffold), len(list_others), len(list_client)
x_all = np.array(list_scaffold + list_others + list_client)
y_all = np.array([True] * n_scaffold + [False] * (n_others + n_client))
is_client = np.array([False] * (n_scaffold + n_others) + [True] * n_client)
is_nonllps = np.array([False] * n_scaffold + [True] * n_others + [False] * n_client)
idx_all = np.array(scaffold_ids + nonllps_ids + client_ids)

# Explicit random_state on every scikit-learn component that accepts one: the
# stacking classifier fits its members through joblib (n_jobs=-1), and a
# global NumPy seed is not guaranteed to reach those workers.
# NN takes no seed argument, so its initialisation and split stay unseeded here.
estimators = [
    ('nn', make_pipeline(StandardScaler(), PCA(n_components=128, random_state=0), NN(lr=0.05))),
    ('rf', make_pipeline(StandardScaler(), PCA(n_components=128, random_state=0),
                         RandomForestClassifier(max_depth=5, max_features="log2", class_weight="balanced",
                                                n_estimators=200, n_jobs=40, random_state=0))),
    ('svm', make_pipeline(StandardScaler(), PCA(n_components=128, random_state=0),
                          SVC(class_weight="balanced", probability=True, C=1, kernel="rbf",
                              gamma="scale", random_state=0))),
    ('hgboost', make_pipeline(StandardScaler(), PCA(n_components=64, random_state=0),
                              HistGradientBoostingClassifier(learning_rate=0.1, max_leaf_nodes=31,
                                                             min_samples_leaf=40, class_weight="balanced",
                                                             random_state=0))),
]

# Inner CV used by the stacking classifier to build the meta-features.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model_scaffold = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(class_weight="balanced"), n_jobs=-1, cv=cv
)

In [None]:
np.random.seed(0)

# Per-fold chunks of IDs and scaffold scores, one pair of lists per group.
idx_sca, idx_non, idx_cli = [], [], []
sca_score_cli, sca_score_non, sca_score_sca = [], [], []

skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
i = 0  # fold counter; not read anywhere in this cell
for train, test in skf.split(x_all, y_all):
    model_scaffold.fit(x_all[train], y_all[train])

    x_test = x_all[test]
    y_test = y_all[test]
    idx_test = idx_all[test]
    is_client_test = is_client[test]
    is_nonllps_test = is_nonllps[test]

    # (mask over the held-out rows, score chunks, ID chunks) for each group,
    # visited in the order scaffold -> non-LLPS -> client.
    fold_groups = (
        (y_test == True, sca_score_sca, idx_sca),
        (is_nonllps_test == True, sca_score_non, idx_non),
        (is_client_test == True, sca_score_cli, idx_cli),
    )
    for group_mask, score_chunks, id_chunks in fold_groups:
        # Column 1 of predict_proba is the probability of the second class.
        score_chunks.append(model_scaffold.predict_proba(x_test[group_mask])[:, 1])
        id_chunks.append(idx_test[group_mask])
    i += 1

# Flatten the per-fold chunks; IDs and scores were appended in the same order.
idx_sca = np.concatenate(idx_sca)
idx_non = np.concatenate(idx_non)
idx_cli = np.concatenate(idx_cli)
sca_score_sca = np.concatenate(sca_score_sca)
sca_score_non = np.concatenate(sca_score_non)
sca_score_cli = np.concatenate(sca_score_cli)

# Final form: ID -> scaffold score.
sca_score_cli = dict(zip(idx_cli, sca_score_cli))
sca_score_non = dict(zip(idx_non, sca_score_non))
sca_score_sca = dict(zip(idx_sca, sca_score_sca))

In [None]:
# Number of scored IDs per group (client, non-LLPS, scaffold).
print(*map(len, (sca_score_cli, sca_score_non, sca_score_sca)))

In [None]:
def _two_score_frame(scaffold_scores, client_scores):
    """Build a frame indexed by ID with a 'Scaffold' and a 'Client' column.

    The index comes from ``scaffold_scores``; the client scores are attached
    by index alignment, so IDs missing from ``client_scores`` get NaN.
    """
    frame = pd.DataFrame(
        list(scaffold_scores.values()),
        index=scaffold_scores.keys(),
        columns=['Scaffold'],
    )
    frame['Client'] = pd.Series(client_scores)
    return frame


# One frame per protein group, each holding both scores for every ID.
df_score_sca = _two_score_frame(sca_score_sca, cli_score_sca)
df_score_cli = _two_score_frame(sca_score_cli, cli_score_cli)
df_score_non = _two_score_frame(sca_score_non, cli_score_non)

In [None]:
# Tag every row with the group its frame represents.
for group_frame, group_name in (
    (df_score_sca, "Scaffold"),
    (df_score_cli, "Client"),
    (df_score_non, "Non-LLPS"),
):
    group_frame["Label"] = group_name

In [None]:
# Fig. 6a: client score (x) against scaffold score (y), one colour per group.
fig = plt.figure(figsize=(6,6))

# Drawn in this order so later groups are painted on top of earlier ones.
scatter_layers = (
    (df_score_non, "gray", "Non-LLPS", 0.5),
    (df_score_cli, "orange", "Client", 0.8),
    (df_score_sca, "b", "Scaffold", 2),
)
for layer_frame, layer_color, layer_label, marker_size in scatter_layers:
    plt.scatter(
        x=layer_frame.loc[:, "Client"],
        y=layer_frame.loc[:, "Scaffold"],
        c=layer_color,
        label=layer_label,
        s=marker_size,
    )

plt.legend()
plt.xlabel("Client score")
plt.ylabel("Scaffold score")
plt.savefig("result/fig6a.pdf")
plt.show()

In [None]:
# Stack the three per-group frames (scaffold, client, non-LLPS) row-wise.
score_frames = [df_score_sca, df_score_cli, df_score_non]
df_all = pd.concat(score_frames)

In [None]:
# Fig. 6b: 2D kernel density of client score vs scaffold score per group.
fig = plt.figure(figsize=(6,6))
label_colors = {"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"}
# NOTE(review): with `hue` set the legend is built from the hue levels;
# confirm whether the `label` list below has any effect.
ax = sns.kdeplot(
    x=df_all.Client,
    y=df_all.Scaffold,
    hue=df_all.Label,
    common_norm=False,  # each group's density is normalised separately
    palette=label_colors,
    label=["Non-LLPS", "Client", "Scaffold"],
    thresh=.33,
)
sns.move_legend(ax, "upper left")
ax.set_xlabel("Client score")
ax.set_ylabel("Scaffold score")
ax.set_xlim([-0.1,1.199])
ax.set_ylim([-0.1,1.199])
# Keep only the left and bottom axis lines.
for side in ("right", "top"):
    ax.spines[side].set_visible(False)
plt.savefig("result/fig6b.pdf")
plt.show()

In [None]:
# Marginal density of the client score (x axis companion of fig6b).
fig = plt.figure(figsize=(5,.8))
marginal_colors = {"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"}
sns.kdeplot(
    x=df_all.Client,
    hue=df_all.Label,
    palette=marginal_colors,
    common_norm=False,  # each group's density is normalised separately
    fill=True,
    legend=False,
)
marginal_ax = plt.gca()
marginal_ax.set_xlabel("")
marginal_ax.set_ylabel("")
# Strip everything except the bottom axis line.
for side in ("right", "top", "left"):
    marginal_ax.spines[side].set_visible(False)
plt.yticks([])
plt.savefig("result/fig6bx.pdf")
plt.show()

In [None]:
# Marginal density of the scaffold score (y axis companion of fig6b).
fig = plt.figure(figsize=(.8,5))
marginal_colors = {"Client":"darkorange", "Scaffold":"blueviolet", "Non-LLPS":"lightgrey"}
sns.kdeplot(
    y=df_all.Scaffold,
    hue=df_all.Label,
    palette=marginal_colors,
    common_norm=False,  # each group's density is normalised separately
    fill=True,
    legend=False,
)
marginal_ax = plt.gca()
marginal_ax.set_xlabel("")
marginal_ax.set_ylabel("")
# Strip everything except the left axis line.
for side in ("right", "top", "bottom"):
    marginal_ax.spines[side].set_visible(False)
plt.xticks([])
plt.savefig("result/fig6by.pdf")
plt.show()