## Dependencies
- [IUPred3](https://iupred3.elte.hu)
- [PScore](https://doi.org/10.7554/eLife.31486)
- [SEG](https://doi.org/10.1016/0097-8485(93)85006-X)

In [None]:
# Path of the FASTA file that the following cells parse and pass to SEG.
inputfile = "data/swiss_prot_human_220916.fasta"

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from statistics import mean

In [None]:
# Parse the input FASTA once, then derive both lookups from the same records:
# seq_id keeps the record IDs in file order, seq_dict maps ID -> sequence string
# (for a repeated ID the last record wins, while seq_id keeps every occurrence).
records = list(SeqIO.parse(inputfile, "fasta"))
seq_id = [record.id for record in records]
seq_dict = {record.id: str(record.seq) for record in records}

In [None]:
# Per-protein feature table: one row per sequence ID, every cell starts as NaN.
# The frame is built directly from a NaN scalar instead of creating an empty
# frame and overwriting it with DataFrame.applymap, which is deprecated since
# pandas 2.1 and was redundant here.
# NOTE: the "Hydrophibicity" spelling is kept deliberately -- the other cells
# of this notebook read and write the column under exactly this name.
df = pd.DataFrame(
    np.nan,
    index=seq_id,
    columns=["Hydrophibicity", "ChargedAA", "LCR", "IDR", "Length"],
)

In [None]:
# Hydropathy value per one-letter residue code (the numbers match the
# Kyte-Doolittle scale); 'U' is given a neutral 0. Any residue code missing
# from this table raises KeyError below.
kd = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5,
    'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4,
    'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9,
    'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8,
    'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    'U': 0,
}

# Average the per-residue values over each sequence, in seq_id order.
Hydrophibicity = []
for protein_id in seq_id:
    residue_values = [kd[residue] for residue in seq_dict[protein_id]]
    Hydrophibicity.append(mean(residue_values))
df["Hydrophibicity"] = Hydrophibicity

In [None]:
# Indicator table: 1 for the residues counted as charged (R, D, E, K), 0 for
# every other listed residue code -- note that H and U are counted as 0.
charge = {residue: int(residue in "RDEK") for residue in "ARNDCQEGHILKMFPSTWYVU"}

# Fraction of charged residues per sequence, in seq_id order.
ChargedAA = []
for protein_id in seq_id:
    flags = [charge[residue] for residue in seq_dict[protein_id]]
    ChargedAA.append(mean(flags))
df["ChargedAA"] = ChargedAA

In [None]:
from iupred3 import iupred3_lib

# One value per protein, in seq_id order: the mean of element 0 of the
# iupred3_lib.iupred() result (presumably the per-residue disorder scores --
# confirm against the IUPred3 library).
idr = []
for k in seq_id:
    try:
        iupred_score = mean(iupred3_lib.iupred(seq_dict[k])[0])
    except Exception:
        # Best effort: a sequence that IUPred3 cannot score is recorded as NaN
        # so the list stays aligned with seq_id. Catching Exception rather than
        # using a bare `except:` lets Ctrl-C / SystemExit abort this long loop
        # instead of being turned into a silent NaN.
        iupred_score = np.nan
    idr.append(iupred_score)

In [None]:
# Store the per-protein mean IUPred3 scores in the "IDR" column. The list is
# assigned positionally; it was built by iterating seq_id, which is also the
# index df was created with.
df["IDR"] = idr

In [None]:
# Placeholder location of the SEG executable; edit before running the next cell.
path_to_seg = "/path/to/seg"
# File that receives SEG's output and is parsed afterwards.
outputfile = "human_lcr.fasta"

In [None]:
# Run the external SEG program on the input FASTA through the shell and
# redirect its standard output to `outputfile`.
# NOTE(review): `-x` is presumably the SEG mode that writes each sequence with
# low-complexity residues replaced by "x" (the next cell counts "x") -- confirm
# against the SEG documentation.
!$path_to_seg $inputfile -x > $outputfile

In [None]:
# Read the SEG output once, then derive per-record values in file order:
# Length is the sequence length, LCR the fraction of characters equal to "x".
masked_seqs = [str(rec.seq) for rec in SeqIO.parse(outputfile, "fasta")]
Length = [len(masked) for masked in masked_seqs]
LCR = [masked.count("x") / len(masked) for masked in masked_seqs]

In [None]:
# Store sequence length and low-complexity fraction in the feature table.
# NOTE(review): both lists are assigned positionally, so this assumes the SEG
# output holds the same records in the same order as the input FASTA that
# defined df's index -- confirm.
df["Length"] = Length
df["LCR"] = LCR

In [None]:
def read_file_to_dict(file_name):
    """Parse a space-separated score file into an ``{identifier: score}`` dict.

    Every non-blank line is split on spaces (a run of spaces counts as one
    separator). The second field is read as a float score; the third field,
    stripped of surrounding whitespace and with its first character dropped,
    becomes the key (presumably that character is the ``>`` of a FASTA
    header -- confirm against the file format). Blank or whitespace-only
    lines are skipped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        dict mapping identifier (str) to score (float). When an identifier
        occurs on several lines, the last one wins.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the second field is not a valid float.
    """
    result_dict = {}

    with open(file_name, 'r') as file:
        for line in file:
            # A blank line (e.g. a trailing empty line) carries no record;
            # without this guard it would fail with IndexError below.
            if not line.strip():
                continue

            # Split on spaces and drop the empty strings produced by
            # consecutive spaces.
            columns = [field for field in line.split(' ') if field]

            value = float(columns[1])
            id_ = columns[2].strip()

            # Drop the first character of the identifier before using it as key.
            result_dict[id_[1:]] = value

    return result_dict

In [None]:
# Placeholder location of the PScore script; edit before running.
path_to_pscore="/path/to/pscore"
# Run the script with the Python interpreter once per protein set (client,
# scaffold, non-LLPS); each call reads one FASTA file from result/ and is given
# a result text file name through `-output`.
!python $path_to_pscore result/drllps_client_clstr_Homo_sapiens.fasta -output result/human_client_pscore.txt
!python $path_to_pscore result/drllps_scaffold_clstr_Homo_sapiens.fasta -output result/human_scaffold_pscore.txt
!python $path_to_pscore result/drllps_nonllps_clstr_Homo_sapiens.fasta -output result/human_nonllps_pscore.txt

In [None]:
# Load the three PScore result files into {id: score} dicts.
scaffold_dict = read_file_to_dict('result/human_scaffold_pscore.txt')
client_dict = read_file_to_dict('result/human_client_pscore.txt')
nonllps_dict = read_file_to_dict('result/human_nonllps_pscore.txt')

# Merge them; for an ID present in several files the later dict wins
# (non-LLPS over client over scaffold).
ps_dict = {**scaffold_dict, **client_dict, **nonllps_dict}

# Write each score into the "PScore" column of the feature table, cell by cell.
for protein_id, pscore in ps_dict.items():
    df.at[protein_id, "PScore"] = pscore

In [None]:
def _fasta_ids(path):
    """Return the set of record keys of the FASTA file at *path*.

    SeqIO.index keeps the file open for random access; the index is closed
    here once the keys have been copied, so no file handle is left open.
    """
    index = SeqIO.index(path, "fasta")
    try:
        return set(index.keys())
    finally:
        index.close()


# Record IDs of the three protein sets (client, scaffold, non-LLPS).
client_set = _fasta_ids("result/drllps_client_clstr_Homo_sapiens.fasta")
scaffold_set = _fasta_ids("result/drllps_scaffold_clstr_Homo_sapiens.fasta")
nonllps_set = _fasta_ids("result/drllps_nonllps_clstr_Homo_sapiens.fasta")

In [None]:
# Feature rows of each protein set. The IDs are sorted before indexing: set
# iteration order for strings changes between interpreter runs (hash
# randomisation), which made the row order of these frames -- and of anything
# exported from them -- differ from run to run.
# Raises KeyError if an ID of a set is missing from df's index.
df_cli = df.loc[sorted(client_set)]
df_sca = df.loc[sorted(scaffold_set)]
df_non = df.loc[sorted(nonllps_set)]

In [None]:
# Tag every row with the name of the set it came from, then stack the three
# frames in the order scaffold, client, non-LLPS.
for frame, tag in ((df_sca, "scaffold"), (df_cli, "client"), (df_non, "non-LLPS")):
    frame["label"] = tag
df_all = pd.concat([df_sca, df_cli, df_non])

In [None]:
# Display the combined, labelled feature table as the cell output.
df_all

In [None]:
# Write the combined feature table (index included, pandas defaults) to CSV.
df_all.to_csv("result/human_clstr_fea.csv")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Six-panel figure (2 x 3 grid): one strip plot overlaid with a white violin
# per feature, grouped by the "label" column of df_all.
sns.set_palette(["mediumpurple", "orange", "lightgray"])
fig = plt.figure(figsize=(10, 8))

# One entry per panel:
# (subplot position, df_all column, y-axis label, use a log-scaled y axis).
# Entries are listed in the order the panels are created.
panels = [
    (231, "IDR", "Disorder", False),
    (232, "Hydrophibicity", "Hydrophobicity", False),
    (234, "ChargedAA", "Charged AA", False),
    (236, "PScore", "PScore", False),
    (233, "Length", "Length", True),
    (235, "LCR", "Low complexity", False),
]

for position, column, ylabel, log_y in panels:
    ax = fig.add_subplot(position)
    # Both layers are drawn on the same explicitly passed axes rather than
    # relying on matplotlib's implicit "current axes" for one of them.
    sns.stripplot(data=df_all, x="label", y=column, size=2, alpha=.2, jitter=.35, ax=ax)
    sns.violinplot(data=df_all, x="label", y=column, linewidth=1, ax=ax, cut=0, color="w")
    ax.set_ylabel(ylabel, fontsize=15)
    ax.set_xlabel("")
    if log_y:
        ax.set_yscale("log")
    # Open frame: hide the right and top borders.
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.tick_params(labelsize=13)

plt.tight_layout()
plt.savefig("result_fig1b.pdf")
plt.show()