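## Dependencies
- [CD-HIT](https://sites.google.com/view/cd-hit) (the `cd-hit` executable must be available on the `PATH`)
- [Biopython](https://biopython.org/) (provides `Bio.SeqIO`)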

In [None]:
from Bio import SeqIO

In [None]:
def clustering(inputfile, dbfile, species, outputdirectory):
    species_nospace="_".join(species.split())
    !cd-hit -i $inputfile -o clustered_file_$species_nospace -c 0.5 -n 2 -M 0 -T 64
    scaffold_ids = !tail -n +2 $dbfile |grep "$species" | grep "Scaffold" | cut -f 2
    regulator_ids = !tail -n +2 $dbfile |grep "$species" | grep "Regulator" | cut -f 2
    client_ids = !tail -n +2 $dbfile |grep "$species" | grep "Client" | cut -f 2
    with open("clustered_file_" + species_nospace + ".clstr", "r") as f:
        f_read=f.read()
        lines=f_read.split("\n")
        parsed=[]
        for l in lines:
            if len(l)==0:
                continue
            if l[0]==">":
                try:
                    parsed.append(clstr)
                    clstr=[]
                except NameError:
                    clstr=[]
            else:
                name=l.split("|")[1]
                length=int(l.split()[1].split("a")[0])
                clstr.append((name,length))
        if len(clstr)>0:
            parsed.append(clstr)
    client=set(client_ids)
    scaffold=set(scaffold_ids)
    regulator=set(regulator_ids)

    client_clstrd=set()
    scaffold_clstrd=set()
    others_clstrd=set()

    n_client_and_scaffold=0
    n_regulator=0 #for debug

    no_representative=0

    for clstr in parsed:
        longest_client=("", 0)
        longest_scaffold=("", 0)
        longest=("", 0)
        regulator_flag=False
        for seq in clstr:
            if seq[0] in client:
                if longest_client[1]<seq[1]:
                    longest_client=seq
            elif seq[0] in scaffold:
                if longest_scaffold[1]<seq[1]:
                    longest_scaffold=seq
            elif seq[0] in regulator:
                regulator_flag=True
            else:
                if longest[1]<seq[1]:
                    longest=seq

        if longest_client[1]>0:
            client_clstrd.add(longest_client[0])
            if longest_scaffold[1]>0:
                n_client_and_scaffold+=1
        if longest_scaffold[1]>0:
            scaffold_clstrd.add(longest_scaffold[0])
        if longest_client[1]<=0 and longest_scaffold[1]<=0:
            if regulator_flag==True:
                n_regulator+=1
            elif longest[1]==0:
                raise RuntimeError("Error!")
            else:
                others_clstrd.add(longest[0])
    if len(parsed)!= len(client_clstrd)+len(scaffold_clstrd)+len(others_clstrd)-n_client_and_scaffold+n_regulator: #for debug
        raise RuntimeError("Error!")
    
    client_fasta=outputdirectory+"drllps_client_clstr_"+species_nospace+".fasta"
    scaffold_fasta=outputdirectory+"drllps_scaffold_clstr_"+species_nospace+".fasta"
    nonllps_fasta=outputdirectory+"drllps_nonllps_clstr_"+species_nospace+".fasta"
    with open(client_fasta, "w") as cli, open(scaffold_fasta, "w") as sca, open(nonllps_fasta, "w") as non:
        for rec in SeqIO.parse(inputfile, "fasta"):
            name=rec.id.split("|")[1]
            if name in client_clstrd:
                SeqIO.write(rec, cli, "fasta")
            elif name in scaffold_clstrd:
                SeqIO.write(rec, sca, "fasta")
            elif name in others_clstrd:
                SeqIO.write(rec, non, "fasta")

In [None]:
# Inputs for the Arabidopsis thaliana run.
species = "Arabidopsis thaliana"
inputfile = "data/swiss_arabi_221216.fasta"
dbfile = "data/DrLLPS_230423.txt"
outputdirectory = "result/"  # trailing slash required: used as a raw prefix

In [None]:
# Run the clustering pipeline with the settings defined in the previous cell.
clustering(
    inputfile=inputfile,
    dbfile=dbfile,
    species=species,
    outputdirectory=outputdirectory,
)

In [None]:
# Inputs for the Mus musculus run.
species = "Mus musculus"
inputfile = "data/swiss_mouse_221216.fasta"
dbfile = "data/DrLLPS_230423.txt"
outputdirectory = "result/"  # trailing slash required: used as a raw prefix

In [None]:
# Run the clustering pipeline with the settings defined in the previous cell.
clustering(
    inputfile=inputfile,
    dbfile=dbfile,
    species=species,
    outputdirectory=outputdirectory,
)

In [None]:
# Inputs for the Saccharomyces cerevisiae run.
species = "Saccharomyces cerevisiae"
inputfile = "data/swiss_prot_yeast_220916.fasta"
dbfile = "data/DrLLPS_230423.txt"
outputdirectory = "result/"  # trailing slash required: used as a raw prefix

In [None]:
# Run the clustering pipeline with the settings defined in the previous cell.
clustering(
    inputfile=inputfile,
    dbfile=dbfile,
    species=species,
    outputdirectory=outputdirectory,
)