Expected column names in the input files (tab-separated, gzip-compressed CSV):

ID <br>
uniprotEnd <br>
uniprotSequence <br>

## CD-HIT clustering

CD-HIT expects its input in FASTA format and writes its clusters in its own `.clstr` format (we use `cdhit_reader` to parse it).

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import gzip
from random import shuffle

In [2]:
# Name of the file the selected cluster representatives are written to
# (passed to `output.to_csv` at the end of the parsing section).
FILE_OUT_NAME = "clustered_v3.csv.gz"

## 1) FASTA input

In [3]:
MINLENGTH = 70
MAXLENGTH = 1000


def _load_length_filtered(path):
    """Read a tab-separated table and keep rows with MINLENGTH <= uniprotEnd <= MAXLENGTH."""
    table = pd.read_csv(path, delimiter="\t")
    # The filter is applied to uniprotEnd: for most proteins it equals Length,
    # but the knotted table contains rare exceptions.
    long_enough = table["uniprotEnd"] >= MINLENGTH
    short_enough = table["uniprotEnd"] <= MAXLENGTH
    return table[long_enough & short_enough]


knotted = _load_length_filtered("../david_all_knotted_NEW.csv.gz")
unknotted = _load_length_filtered("../david_all_unknotted_NEW.csv.gz")

In [4]:
# Preview the length-filtered unknotted table.
unknotted

Unnamed: 0,ID,latestVersion,globalMetricValue,uniprotStart,uniprotEnd,uniprotSequence,Length
0,AF-A0A009NLX4-F1,4,94.56,1,306,MALRHFLTLRDLSTLELNRILERASELKKMQHDNKVYQPFVGKVLG...,306
1,AF-A0A010QHM8-F1,4,74.75,1,575,MSQLRDEKDNTTLNDSKESTNPKVVVDSVFDTSEKLFLGGIDDGSD...,575
2,AF-A0A010R445-F1,4,91.31,1,353,MKQTAFRVTRQALNGAQSRAYSQSTGPRHLMSIADLSPAEFATLVR...,353
3,AF-A0A011RB99-F1,4,83.81,1,762,MAPPGDTAGPAWERRLHGYREQLACGFPQVADVFAECMREALAVLS...,762
4,AF-A0A014C010-F1,4,94.69,1,306,MALRHFLTLRDLSTLELNRVLQRASELKKMQQSNKVYQPFVGKVLG...,306
...,...,...,...,...,...,...,...
325243,AF-X6M7S0-F1,4,90.56,1,355,MSFFKRKIPQLFWNQLRNVSSTRTTKHVLKISDLSQKELRDVLTFA...,355
325244,AF-X7EDX4-F1,4,95.50,1,308,MNHFLDIHKTDPSDLEAILTQASAMKAARNGAPKATPDAEQPLKNH...,308
325245,AF-X7YPC5-F1,4,92.25,1,107,MSYDLDAELPAADAVLMLRVQAERMHGGFFPSAREYSVLYGLSDRR...,107
325246,AF-X7ZDJ7-F1,4,76.44,1,708,MGRHSLPDPEDSADEPPDEYAAEQQDWADQIADQPGGGRHSEVGYP...,708


In [5]:
def _as_seq_records(frame, label):
    """Build one SeqRecord per row of `frame`, using `label` as the FASTA description."""
    return [
        SeqRecord(Seq(sequence), id=protein_id, description=label)
        for protein_id, sequence in zip(frame["ID"], frame["uniprotSequence"])
    ]


knotted_records = _as_seq_records(knotted, "knotted")
unknotted_records = _as_seq_records(unknotted, "unknotted")

In [6]:
import random

# Fixed seed so the order of records in the FASTA file (the CD-HIT input) is
# identical on every Restart & Run All.
SHUFFLE_SEED = 42

records = knotted_records + unknotted_records
# A dedicated Random instance keeps the shuffle reproducible without touching
# the module-level random state.
random.Random(SHUFFLE_SEED).shuffle(records)

# Kept as the last expression so the value returned by SeqIO.write is shown
# as the cell output.
SeqIO.write(records, "all.fasta", "fasta")

763683

## 2) CD-HIT

In [7]:
# Takes a long time; it is better to run it in a terminal
#!cd-hit -i all.fasta -o all_clustered -c 0.9 -s 0.8 -G 0 -aS 0.9 -T 0
# CD-HIT version 4.8.1 (built on Aug 20 2021)

## 3) Parsing CD-HIT output

In [7]:
from cdhit_reader import read_cdhit

In [8]:
# Parse the CD-HIT cluster file (the `.clstr` companion of the
# `-o all_clustered` run above) into cluster objects; later cells read each
# cluster's `refname` and `sequences`.
clusters = read_cdhit("./all_clustered.clstr").read_items()

In [9]:
def _index_by_id(frame):
    """Return `frame` indexed by protein ID with a single row per ID.

    Safe to call on a frame that is already indexed by ID, so re-running the
    cell no longer fails with ``KeyError: 'ID'``.
    """
    if frame.index.name != "ID":
        frame = frame.set_index("ID")
    frame = frame.drop_duplicates()
    # drop_duplicates() compares column values only and ignores the index, so
    # repeated IDs would survive it; drop them explicitly so that
    # `frame.loc[protein_id]` always yields exactly one row.
    return frame[~frame.index.duplicated(keep="first")]


knotted = _index_by_id(knotted)
unknotted = _index_by_id(unknotted)

In [10]:
# Collect the reference sequence name of every CD-HIT cluster.
reference_names = [cluster.refname for cluster in clusters]
representants = pd.Series(reference_names)

In [11]:
# Split the cluster representatives by the table they come from.
is_unknotted_ref = representants.isin(unknotted.index)
is_knotted_ref = representants.isin(knotted.index)

# Keep only the rows of each table that were chosen as representatives.
unknotted_sel = unknotted.loc[representants[is_unknotted_ref]]
knotted_sel = knotted.loc[representants[is_knotted_ref]]

len(knotted_sel), len(unknotted_sel)

(200477, 156906)

In [12]:
# Fixed seed so the row order of the exported table is reproducible.
OUTPUT_SHUFFLE_SEED = 42

# Stack the knotted and unknotted representatives, then shuffle all rows.
output = pd.concat([knotted_sel, unknotted_sel], axis=0).sample(frac=1, random_state=OUTPUT_SHUFFLE_SEED)

In [14]:
# Write the shuffled representatives to FILE_OUT_NAME. No separator is passed
# here, whereas the input tables were read with delimiter="\t".
# NOTE(review): confirm downstream readers expect to_csv's default separator.
output.to_csv(FILE_OUT_NAME)

## 4) Search for non-trivial clusters (= containing both knotted and unknotted proteins), minimal edit distance

Work in progress

In [21]:
# Clusters that contain at least two sequences (i.e. more than just the
# representative itself).
nontrivial = [cluster for cluster in clusters if len(cluster.sequences) >= 2]

In [22]:
# Number of clusters with more than one member.
len(nontrivial)

99682

In [24]:
from tqdm.notebook import tqdm

In [29]:
# Walk the multi-member clusters from position 19 onwards and stop at the
# first one whose members are neither all knotted nor all unknotted; after the
# loop `cl` holds that cluster (or the last cluster if none is mixed).
for cl in tqdm(nontrivial[19:]):
    member_names = [member.name for member in cl.sequences]
    only_knotted = all(name in knotted.index for name in member_names)
    only_unknotted = all(name in unknotted.index for name in member_names)
    if not only_knotted and not only_unknotted:
        break

  0%|          | 0/99663 [00:00<?, ?it/s]

In [31]:
# For every member of nontrivial[27], show its name and whether that name is
# present in the knotted table.
[(x.name, x.name in knotted.index) for x in nontrivial[27].sequences]

[('AF-A0A6P6BU57-F1', False),
 ('AF-A0A5F9DHF8-F1', True),
 ('AF-A0A091DW59-F1', True),
 ('AF-A0A7N5JDS6-F1', False),
 ('AF-G9K6A8-F1', True),
 ('AF-A0A3Q0E8A1-F1', True),
 ('AF-D2HVG5-F1', False),
 ('AF-A0A673UQ91-F1', True),
 ('AF-A0A2K5N299-F1', False)]

In [32]:
# Show the member records of the same cluster as parsed by cdhit_reader.
nontrivial[27].sequences

[ClusterSequence(id=0, name=AF-A0A6P6BU57-F1, length=899, identity=90.05, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=1, name=AF-A0A5F9DHF8-F1, length=963, identity=92.36, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=2, name=AF-A0A091DW59-F1, length=995, identity=90.04, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=3, name=AF-A0A7N5JDS6-F1, length=995, identity=92.86, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=4, name=AF-G9K6A8-F1, length=993, identity=92.55, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=5, name=AF-A0A3Q0E8A1-F1, length=1000, identity=100.0, is_ref=True, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=6, name=AF-D2HVG5-F1, length=994, identity=92.76, is_ref=False, seqtype=SeqType.PROTEIN, strand=Strand.NONE),
 ClusterSequence(id=7, name=AF-A0A673UQ91-F1, length=993, identity=90.

In [36]:
# Amino-acid sequence of one member of nontrivial[27], looked up in the
# unknotted table.
unknotted.loc['AF-D2HVG5-F1'].uniprotSequence

'MSAGAGRGPRGSRAQPLAPLWCFSAALGMLWTPASQAFNLDVDKLTVYSGPEGSYFGYAVDFHIPAARTASVLVGAPKANTSQPDIVEGGAVYYCPWPAEGSAQCKQIPFDNTNNRKIRVNGTKEPIEFKSNQWFGATVKAHKGEVVACAPLYHWRTLKPTPEKDPVGTCYVAIQNFSAYAEYSPCRNSNADPEGQGYCQAGFSLDFYKNGDLIVGGPGSFYWQGQVITASIADIIANYSFKDILRKLAREKQTDVAPASYDDSYLGYSVAAGEFTGDSEQELVAGVPRGAQNFGYVSIINSTDMTFIQNFTGEQMASYFGYTVTVSDVNNDGMDDVLIGAPLFMERESESSPREVGQVYLYLQVSALVFRDPQILGGTEVFGRFGSAVAHLGDLNQDGYNDIAIGSPFAGKDQRGQVLIYNGNKDGLNTKPSQILQGVWASQTIPSGFGFTLRGDSDIDKNDYPDLIVGAFGTGKVAVYRARPVVTVDAQLLLHPMIINLENKTCQIPESLSVVACFSLRVCVSVTGQSISNTIGLTAELHLDSLKQKGAIKRTLFLDNHQSHRIFPLPVKRQKSLQCQDFVVYLRDETEFRDKLSPINISLNYSLDESTFKEGLEVKPILNYYRENTVTEQAHILVDCGEDNVCIPDLKLSARPDKNQVIIGDENHLTLIINARNEGEGAYEAELFVMIPEEADYVGIERSNKGLRPLSCEYKMENITRMVVCDLGNPMVAGTNYSLGLRFAVPRLEKTNMSINFDLQIRSSNKDNPDSNFVSLQINITAVAQVEIRGVSHPPQIVLPIHNWEPEEEPHKEEGVGPLVEHIYELHNIGPSTISDTLLEVGWPFSARDEFLLYIFHIQTLGPLQCQTNPDINPQDIKPAAPPEDTPELSAFLRNSTIPHLVRKRDVRMPEPHRQSPAKILNCTNIECLQISCAVGRLEGGESAVLKVRSRLWAKTFLQRKNDPYSLASLVSFKVKKMPYKDQPAKLPEGSIAV'

In [37]:
# Amino-acid sequence of another member of nontrivial[27], looked up in the
# knotted table.
knotted.loc['AF-A0A673UQ91-F1'].uniprotSequence

'ASVLVGAPKANTSQPDIVEGGAVYYCPWPSEGSAQCKQIPFDNTNNRKIRVNGTKQPIEFKSNQWFGATVKAHKGEVVACAPLYHWRTLKPTPEKDPVGTCYVAIQNFSAYAEYSPCRNSNADPEGQGYCQAGFSLDFYKNGDLIVGGPGSFYWQGQVITASIADIIANYSFKDILRKLAREKQTDVAPASYDDSYLGYSVAAGEFTGDSEQELVAGVPRGAQNFGYVSIINSTDMTFIQNFTGEQMASYFGYTVTVSDVNNDGMDDVLIGAPLFMEREFDTSPREVGQVYLYLQVSALVFRDPQILAGTDVFGRFGSAVAHLGDLNQDGYNDIAIGAPFAGKDQRGKVLIYNGQKDGLNTQASQILQGLWASQTVPSGFGFTLRGDSDIDKNDYPDLIVGAFGTGKVAVYRARPVVTVDAQLLLHPMIINLENKTCQMPESVSPVACFSLKVCASVAGQSISNTIGLTVEVHLDTLKQKGAVKRTLFLDNNESHIIFPLTTQRQKPFQCRDFVVYLRDETEFRDKLSPINISLNYSLDEATFKEGLEVKPILNYYRENTVTEQAHILVDCGEDNMCIPDLKLSARPDKYQVIIGDENHLMLIINARNEGEGAYEAELFVIIPEEADYVGIERSNKGLRPLSCEYKMENVTRMVVCDLGNPMLAGTNYSLGLRFTVPRLEKTNMSINFELQIRSSNKDNPDSNFVSLQINITAVAQVEIRGVSHPPQIVLPIHNWEPEEEPHKEEGVGPLVEHIYELHNIGPSTISDTLLEVGWPFSARDEFLLYIFHIQTLGPLRCQTNPDINPQDIKPVTAPEETPELSAFLRNSTIPHLVRKRDVHAPELHRQSPAKILNCTNIECLQISCMVGRLEGGESAVLKVRSRLWAKTFLQRKNDPYSLASLVSFKVKKMPYQDQPAKLPEGSMAIKTSVIWATPNVSFSIPLWVIILAILLGLLVLAILTLALWKCGFFDRARPPQDDMNDREQLTDEKTVET'

In [40]:
def edit_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions (each of cost 1) needed to turn `s1` into
    `s2`.

    Only two rows of the dynamic-programming table are kept, so memory use is
    proportional to the shorter input instead of len(s1) * len(s2).

    Args:
        s1: First sequence (e.g. a protein sequence string).
        s2: Second sequence.

    Returns:
        The edit distance as a non-negative int; for an empty input it equals
        the length of the other sequence.
    """
    # Adapted from: https://stackoverflow.com/questions/2460177/edit-distance-in-python

    # The distance is symmetric, so put the shorter sequence along the row to
    # keep the row buffers as small as possible.
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    # previous[j] = distance between the first i-1 elements of s1 and the
    # first j elements of s2; row 0 is the cost of inserting j elements.
    previous = list(range(len(s2) + 1))
    for i, a in enumerate(s1, start=1):
        current = [i]  # deleting the first i elements of s1
        for j, b in enumerate(s2, start=1):
            cost = 0 if a == b else 1
            current.append(min(current[j - 1] + 1,      # insertion
                               previous[j] + 1,         # deletion
                               previous[j - 1] + cost)) # match / substitution
        previous = current

    return previous[-1]

# Edit distance between an unknotted and a knotted member of nontrivial[27].
edit_distance(unknotted.loc['AF-A0A7N5JDS6-F1'].uniprotSequence, knotted.loc['AF-A0A3Q0E8A1-F1'].uniprotSequence)


197