## Dependencies
- [IUPred3](https://iupred3.elte.hu)
- [SEG](https://doi.org/10.1016/0097-8485(93)85006-X)

In [None]:
# FASTA file holding the protein sequences that the rest of the notebook featurizes.
inputfile = "../fig1/data/swiss_prot_human_220916.fasta"

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from statistics import mean

In [None]:
# Read every FASTA record once as (record id, sequence string) pairs.
id_sequence_pairs = [
    (record.id, str(record.seq)) for record in SeqIO.parse(inputfile, "fasta")
]

# seq_id keeps the record ids in file order; seq_dict maps id -> sequence
# (if an id occurs more than once, the last record wins in the dict).
seq_id = [identifier for identifier, _ in id_sequence_pairs]
seq_dict = dict(id_sequence_pairs)

In [None]:
# Per-protein feature table: one row per sequence id, one column per feature.
# The frame is created already filled with NaN (float64), which is what the
# former `applymap(lambda x: np.nan)` pass produced; `applymap` is deprecated
# in recent pandas releases and the extra pass over every cell was redundant.
df = pd.DataFrame(
    np.nan,
    index=seq_id,
    columns=["Hydrophibicity", "ChargedAA", "LCR", "IDR", "Length"],
)

In [None]:
# Per-residue hydropathy values (they match the Kyte-Doolittle scale);
# 'U' is given a neutral 0. Any residue symbol missing from this table
# raises KeyError in the loop below.
kd = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5, 'Q': -3.5, 'E': -3.5,
    'G': -0.4, 'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8,
    'P': -1.6, 'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2, 'U': 0,
}

# Mean hydropathy over all residues of each protein, in seq_id order.
Hydrophibicity = []
for protein_id in seq_id:
    residue_scores = [kd[residue] for residue in seq_dict[protein_id]]
    Hydrophibicity.append(mean(residue_scores))
df["Hydrophibicity"] = Hydrophibicity

In [None]:
# Per-residue indicator: 1 for R, D, E and K, 0 for every other listed symbol
# (including H and U). Positive and negative residues are both counted as 1.
charge = dict.fromkeys("ARNDCQEGHILKMFPSTWYVU", 0)
charge.update(dict.fromkeys("RDEK", 1))

# Fraction of flagged residues in each protein, in seq_id order.
ChargedAA = []
for protein_id in seq_id:
    charged_flags = [charge[residue] for residue in seq_dict[protein_id]]
    ChargedAA.append(mean(charged_flags))
df["ChargedAA"] = ChargedAA

In [None]:
from iupred3 import iupred3_lib

# Mean IUPred3 score per protein, collected in seq_id order.
idr = []
for k in seq_id:
    try:
        # The first element of the value returned by iupred() is averaged;
        # presumably it is the per-residue score list -- confirm against the
        # IUPred3 library documentation.
        iupred_score = mean(iupred3_lib.iupred(seq_dict[k])[0])
    except Exception:
        # Best effort: a sequence IUPred3 cannot score gets NaN instead of
        # aborting the whole run. `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt/SystemExit propagate, so this long loop can
        # still be interrupted.
        iupred_score = np.nan
    idr.append(iupred_score)

In [None]:
# Store the per-protein mean IUPred3 score. `idr` was built by iterating
# `seq_id`, which is also the row index of `df`, so values line up by position.
df["IDR"] = idr

In [None]:
# FASTA file that receives the SEG output, and the location of the SEG
# executable (placeholder path -- point it at the local installation).
outputfile = "human_lcr.fasta"
path_to_seg = "/path/to/seg"

In [None]:
# IPython shell escape: run SEG on `inputfile` and redirect its stdout to `outputfile`.
# NOTE(review): `-x` presumably makes SEG mask low-complexity residues with "x"
# (the following cell counts "x" characters) -- confirm against the SEG manual.
!$path_to_seg $inputfile -x > $outputfile

In [None]:
# For every record in the SEG output: sequence length and the fraction of
# positions that are the lowercase character "x".
Length = []
LCR = []
for record in SeqIO.parse(outputfile, "fasta"):
    masked_sequence = str(record.seq)
    n_residues = len(masked_sequence)
    Length.append(n_residues)
    LCR.append(masked_sequence.count("x") / n_residues)

In [None]:
# Attach sequence length and "x" fraction to the feature table.
# NOTE(review): both lists are assigned by position, not by record id, so this
# assumes the SEG output contains the same records in the same order as
# `inputfile` (the source of df's index) -- verify.
df["Length"] = Length
df["LCR"] = LCR

In [None]:
# Alphabet of the 20 residue symbols used for composition features.
AAs = "ACDEFGHIKLMNPQRSTVWY"

# Single residues in alphabet order, and all ordered residue pairs
# (first residue varies slowest): "AA", "AC", ..., "YY".
AA = list(AAs)
diAA = [first + second for first in AAs for second in AAs]

In [None]:
def countAA(seq):
    """Return residue and dipeptide frequencies of a sequence as one flat list.

    The first ``len(AA)`` entries are, for each symbol in the global ``AA``
    list, its number of occurrences divided by ``len(seq)``. The remaining
    ``len(diAA)`` entries are, for each pair in the global ``diAA`` list, the
    number of overlapping adjacent pairs equal to it divided by
    ``len(seq) - 1``. Symbols or pairs not listed in ``AA``/``diAA`` are not
    counted but still contribute to the denominators.

    When a denominator would be zero (empty sequence, or a single residue for
    the dipeptide part) the affected entries are 0.0 instead of raising
    ZeroDivisionError.

    Args:
        seq: Sequence as a string.

    Returns:
        List of floats: residue fractions followed by dipeptide fractions.
    """
    from collections import Counter

    length = len(seq)
    n_pairs = length - 1

    # Counter returns 0 for keys it has never seen, so no try/except is
    # needed for residues or pairs that do not occur in the sequence.
    residue_counts = Counter(seq)
    pair_counts = Counter(seq[i:i + 2] for i in range(n_pairs))

    count = [residue_counts[k] / length if length else 0.0 for k in AA]
    countdi = [pair_counts[k] / n_pairs if n_pairs > 0 else 0.0 for k in diAA]
    return count + countdi

In [None]:
# Composition table: one row per sequence id, one column per residue and per
# dipeptide. Created already filled with NaN (float64), which is what the
# former `applymap(lambda x: np.nan)` pass produced; `applymap` is deprecated
# in recent pandas releases and the extra pass over every cell was redundant.
dfaa = pd.DataFrame(np.nan, index=seq_id, columns=AA + diAA)

In [None]:
# Fill each protein's row with its residue and dipeptide fractions.
for protein_id in seq_id:
    dfaa.loc[protein_id] = countAA(seq_dict[protein_id])

In [None]:
# Display the composition table as this cell's output.
dfaa

In [None]:
# Place the global features and the composition features side by side;
# rows are matched on the shared sequence-id index.
dfall = pd.concat((df, dfaa), axis="columns")
print(dfall)

In [None]:
# Map every sequence id to its feature row as a NumPy array.
df_to_dict = {}
for protein_id in dfall.index:
    df_to_dict[protein_id] = np.array(dfall.loc[protein_id])

In [None]:
# Write the id -> feature-vector dict to disk. np.save stores a dict as a
# pickled 0-d object array, so read it back with
# np.load("human_feature.npy", allow_pickle=True).item().
np.save("human_feature.npy", df_to_dict) 