# Computing overlaps of knot cores with minimums

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
from tqdm import tqdm

In [2]:
# Patch size baked into the dataset name, the row ids and the output file name.
# NOTE(review): presumably the number of residues replaced by 'X' per patched variant — confirm.
PATCH_SIZE = 40
# Hugging Face Hub dataset id handed to load_dataset() further down.
HF_DATASET = f'roa7n/patched_1000_test_p_{PATCH_SIZE}_m2_predictions'
# Destination of the per-sequence summary CSV written by the last cell.
OUTPUT = f'/home/jovyan/data/proteins_m2/preds_{PATCH_SIZE}_minimums.csv'

In [3]:
# Register tqdm's progress_* helpers on pandas objects.
# NOTE(review): no progress_apply/progress_map call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [4]:
# Load the predictions dataset named by HF_DATASET (the log below shows it is cached for later calls).
hf_dataset = load_dataset(HF_DATASET)
# Echo the DatasetDict so its splits, features and row count are visible.
hf_dataset

Using custom data configuration roa7n--patched_1000_test_p_40_m2_predictions-733cadb9a8b05715


Downloading and preparing dataset None/None to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40_m2_predictions-733cadb9a8b05715/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/468M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/942535 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40_m2_predictions-733cadb9a8b05715/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'sequence_str', 'label', 'features', 'm2_preds'],
        num_rows: 942535
    })
})

In [5]:
# Materialise the only split ('train') as a pandas DataFrame for index-based lookups.
df = hf_dataset['train'].to_pandas()
# Sanity check: expect one row per (sequence, patch position) and 5 columns.
print(df.shape)

(942535, 5)


In [6]:
# Preview the predictions frame; ids follow '<seq_id>_<patch_size>_<start>'.
df

Unnamed: 0,id,sequence_str,label,features,m2_preds
0,A0A533UME0_40_-1,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-0.0003535352272479031, -0.003224661822912367...",0.654722
1,A0A533UME0_40_0,X X X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.004555738866244496, -0.005816199156441826,...",0.913441
2,A0A533UME0_40_1,M X X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.004163659869533376, -0.006157918056800327,...",0.887685
3,A0A533UME0_40_2,M K X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.003944790247551566, -0.006149100786586814,...",0.862050
4,A0A533UME0_40_3,M K L X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.0034333367500494205, -0.006310329959745786...",0.915727
...,...,...,...,...,...
942530,A0A3N5VR99_40_220,M Q T N L H G R D L I S D L D F S K E E V E T ...,1,"[0.005500724725429507, -0.010576869343893236, ...",0.997332
942531,A0A3N5VR99_40_221,M Q T N L H G R D L I S D L D F S K E E V E T ...,1,"[0.00519484348943422, -0.011124183164056944, -...",0.997279
942532,A0A3N5VR99_40_222,M Q T N L H G R D L I S D L D F S K E E V E T ...,1,"[0.00539999533751967, -0.010002495486048701, -...",0.993463
942533,A0A3N5VR99_40_223,M Q T N L H G R D L I S D L D F S K E E V E T ...,1,"[0.005613148182163741, -0.01017291308618723, -...",0.994330


## Calculate minimum for each sequence

1. Select all predictions that belong to one sequence (the original and its patched variants)
2. Check that the original sequence was predicted as having a knot
3. Take minimum out of patched versions
4. Save

1. Create a list of IDs from the original CSV file:

In [7]:
# Raw per-sequence metadata file; its header includes the knot core columns
# (knot_start / knot_end) used later in this notebook.
INPUT = '/home/jovyan/data/proteins_m3_v2/raw_inputs/raw_new_data_with_core_intervals.csv'
CSV_DELIMITER = ','

import csv

# Read every row as a list of strings; the first row is the header.
with open(INPUT, newline='') as f:
    data = [row for row in csv.reader(f, delimiter=CSV_DELIMITER)]

# Show the header and the first record.
data[:2]

[['seq_id',
  'sequence',
  'global_metric_value',
  'domain_architecture',
  'interpro',
  'max_knot_topology',
  'seq_length',
  'label',
  'family',
  'knot_start',
  'knot_end',
  'knot_len',
  'core_percentage'],
 ['A0A533UME0',
  'MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHESSGSDRDRSLIRTILKYLETPQYLRRGLFQKISELKFAGSLSPLKIPHHTYTSDSHKIKAGDIREGMIVFAKGRKFVDVGLDQIITYSGEDKEGKRVTMQFKTGYPELLAKQISRNEIKQYWGYEVKESANLRTLLSGWNSNVILTTKKGKTIHKVQKYFDEISNNPVLVVFGSPERGIHEILGISIKEIPKSQNLNFFPEQATETVRLEEAILGTLAILNILIRN',
  '92.81',
  'PF02598;',
  'IPR029028;IPR012340;IPR003750;IPR029026;',
  '3_1',
  '271',
  '1',
  'SPOUT',
  '38',
  '246',
  '208',
  '76.75276752767527']]

In [8]:
# Row 0 of `data` carries the column names, the rest are the records.
# Values stay strings because they come straight from csv.reader.
df_raw = pd.DataFrame(columns=data[0], data=data[1:])
del data  # the raw list is not needed once the frame exists
df_raw

Unnamed: 0,seq_id,sequence,global_metric_value,domain_architecture,interpro,max_knot_topology,seq_length,label,family,knot_start,knot_end,knot_len,core_percentage
0,A0A533UME0,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,92.81,PF02598;,IPR029028;IPR012340;IPR003750;IPR029026;,3_1,271,1,SPOUT,38,246,208,76.75276752767527
1,A0A2S2NXP0,MKAIGSSFSVNFRRILCHNVTARTYVSDTKGSDKKQSNDDILNSIK...,77.0,PF00588;,IPR029028;IPR029064;IPR001537;IPR013123;IPR029...,3_1,376,1,SPOUT,95,307,212,56.38297872340426
2,A0A154BTR5,MSSVYLGLLHHPIYNKNDEIVATAVTNFDIHDIARAARTYDISRYF...,92.88,PF09936;,IPR029028;IPR019230;IPR029026;,3_1,192,1,SPOUT,108,156,48,25.0
3,A0A5J4Z539,MNAIAFGCSSQTWCTVRARHETRIRVRTCASGNARVGEGSNDRKAA...,73.0,PF04452;,IPR029028;IPR006700;IPR046886;IPR029026;,3_1,381,1,SPOUT,287,351,64,16.79790026246719
4,A0A1Q9VZN3,MRWGRAVPDAPVTVTDPTDTRLDDIRDLNSSDRRPDLPGGKGLVVA...,92.19,PF00588;,IPR029028;IPR029064;IPR001537;IPR029026;,3_1,288,1,SPOUT,219,261,42,14.583333333333334
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5725,A0A3P6SS45,MRQRHFHVTVALVRRFSSAKENTDVTYYIPPKTFRPAYYHPMKPSQ...,79.38,PF07147;,IPR039982;IPR010793;,3_1,499,1,ribosomal-mitochondrial,112,466,354,70.94188376753507
5726,A0A1U7S9S7,MAALRKKWSELCRLVQAESATAAAAAATAAAEAGLGPCYPPVVASV...,86.5,PF07147;,IPR039982;IPR010793;,3_1,433,1,ribosomal-mitochondrial,85,415,330,76.21247113163972
5727,U1Q8C1,MRGLFPVDMSDGRAIRIQARNAVYPPAQAPNAVRLLADMELELQRV...,89.88,PF14028;PF04738;,IPR006827;IPR023809;,3_1,490,1,biosynthesis of lantibiotics,342,470,128,26.122448979591837
5728,A0A4Q3GPX1,VLYSATHQCRLVPRMASAYNILRSSHPLLRLLADLQYQGIQYQFLP...,87.94,PF14028;PF04738;,IPR006827;IPR023809;,3_1,444,1,biosynthesis of lantibiotics,317,421,104,23.423423423423422


In [9]:
# Sequence ids in file order; this list drives the reduction loop below.
raw_ids = df_raw['seq_id'].tolist()
len(raw_ids)

5730

In [10]:
# Peek at the first few ids to confirm they are bare accessions (no patch suffix).
raw_ids[:5]

['A0A533UME0', 'A0A2S2NXP0', 'A0A154BTR5', 'A0A5J4Z539', 'A0A1Q9VZN3']

In [11]:
# Index both frames by their id so rows can be fetched with .loc[...].
# The guards make this cell safe to re-run: once 'seq_id' / 'id' have become the
# index they are no longer columns, and a second set_index() would raise KeyError.
if df_raw.index.name != 'seq_id':
    df_raw = df_raw.set_index('seq_id')
if df.index.name != 'id':
    df = df.set_index('id')
# The .str accessor used on the index below needs string labels.
df.index = df.index.astype('str')

2. Iterate over IDs in the list:

In [12]:
# Example: all rows whose id contains this accession (the '-1' row plus every patch start).
# Note that str.contains does a substring/regex match, not an exact prefix match.
df[df.index.str.contains('A0A533UME0')]

Unnamed: 0_level_0,sequence_str,label,features,m2_preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A0A533UME0_40_-1,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-0.0003535352272479031, -0.003224661822912367...",0.654722
A0A533UME0_40_0,X X X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.004555738866244496, -0.005816199156441826,...",0.913441
A0A533UME0_40_1,M X X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.004163659869533376, -0.006157918056800327,...",0.887685
A0A533UME0_40_2,M K X X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.003944790247551566, -0.006149100786586814,...",0.862050
A0A533UME0_40_3,M K L X X X X X X X X X X X X X X X X X X X X ...,1,"[-0.0034333367500494205, -0.006310329959745786...",0.915727
...,...,...,...,...
A0A533UME0_40_227,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-0.00028049852357980853, -0.00165378707174568...",0.619606
A0A533UME0_40_228,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-0.000509662395131727, -0.0015890351885445853...",0.468908
A0A533UME0_40_229,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-9.08454637533751e-05, -0.0020829805243351984...",0.357207
A0A533UME0_40_230,M K L S I A I P D S S V S D E S T Q L G K S M ...,1,"[-0.0005732211873805682, -0.001576390657163528...",0.420726


In [13]:
import numpy as np

def extract_seq_min_info(df):
    """Collect the predictions of one sequence and locate the lowest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single sequence. The index holds ids shaped like
        ``<seq_id>_<patch_size>_<start>`` and the ``m2_preds`` column holds
        one prediction per row.

    Returns
    -------
    tuple
        ``(preds, starts, min_pred, min_i)``: the predictions as a list, the
        start tokens as a list of strings (same row order), the lowest
        prediction, and the start token of the row that holds it. For an empty
        frame the two lists are empty and both minimum fields are ``None``.
    """
    preds = list(df['m2_preds'])
    # The start is the LAST '_'-separated token. Splitting from the right keeps
    # this correct even when the sequence id itself contains an underscore.
    starts = [row_id.rsplit('_', 1)[-1] for row_id in df.index]

    if not preds:
        # Nothing to minimise over; avoid the opaque ValueError from min()/argmin().
        return preds, starts, None, None

    # Derive value and position from one argmin so they always refer to the same row.
    min_pos = int(np.argmin(preds))
    return preds, starts, preds[min_pos], starts[min_pos]

In [14]:
# Group the row positions of `df` by their '<seq_id>_<patch_size>' prefix in ONE pass.
# The previous per-id `df.index.str.contains(raw_id)` rescanned the whole index for
# every sequence and matched substrings/regexes, so a short id could also pick up
# rows of a longer id that merely contains it.
rows_by_seq = {}
for row_pos, patched_id in enumerate(df.index):
    rows_by_seq.setdefault(patched_id.rsplit('_', 1)[0], []).append(row_pos)

reduced_data = []

for i, raw_id in enumerate(raw_ids):
    raw_seq_info = df_raw.loc[raw_id]
    seq_key = f'{raw_id}_{PATCH_SIZE}'
    original_id = f'{seq_key}_-1'  # row with start '-1' (the unpatched sequence in the preview)

    # Start from the "no predictions available" record; key order defines the CSV columns.
    seq_dict = {'id': raw_id,
                'sequence_str': raw_seq_info['sequence'],
                'sequence_pred': None,
                'patched_starts': None,
                'patched_preds': None,
                'min_start': None,
                'min_pred': None,
                'knot_start': raw_seq_info['knot_start'],
                'knot_end': raw_seq_info['knot_end'],
                'family': raw_seq_info['family']}

    if original_id in df.index:
        seq_info = df.loc[original_id]

        # Exact-prefix selection, in the same row order as `df`.
        df_seq = df.iloc[rows_by_seq[seq_key]]
        # NOTE(review): df_seq still contains the '-1' row, so min_start can be '-1'
        # (the original sequence itself scored lowest) — confirm this is intended.
        patched_preds, patched_starts, min_pred, min_start = extract_seq_min_info(df_seq)
        del df_seq

        seq_dict['sequence_pred'] = seq_info['m2_preds']
        seq_dict['patched_starts'] = patched_starts
        seq_dict['patched_preds'] = patched_preds
        seq_dict['min_start'] = min_start
        seq_dict['min_pred'] = min_pred

    if i % 500 == 0:
        print(f'[{i:4}/{len(raw_ids)}]')

    reduced_data.append(seq_dict)

[   0/5730]
[ 500/5730]
[1000/5730]
[1500/5730]
[2000/5730]
[2500/5730]
[3000/5730]
[3500/5730]
[4000/5730]
[4500/5730]
[5000/5730]
[5500/5730]


In [15]:
# One row per raw sequence; columns follow the key order of the dicts built in the loop above.
df_reduced = pd.DataFrame(reduced_data)
df_reduced

Unnamed: 0,id,sequence_str,sequence_pred,patched_starts,patched_preds,min_start,min_pred,knot_start,knot_end,family
0,A0A533UME0,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,0.654722,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.6547220349311829, 0.9134407639503479, 0.887...",229,0.357207,38,246,SPOUT
1,A0A2S2NXP0,MKAIGSSFSVNFRRILCHNVTARTYVSDTKGSDKKQSNDDILNSIK...,0.992380,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.9923796653747559, 0.9998968839645386, 0.999...",335,0.984055,95,307,SPOUT
2,A0A154BTR5,MSSVYLGLLHHPIYNKNDEIVATAVTNFDIHDIARAARTYDISRYF...,0.977540,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.9775395393371582, 0.9376558661460876, 0.919...",138,0.533647,108,156,SPOUT
3,A0A5J4Z539,MNAIAFGCSSQTWCTVRARHETRIRVRTCASGNARVGEGSNDRKAA...,0.974824,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.974823534488678, 0.9995740056037903, 0.9993...",337,0.935046,287,351,SPOUT
4,A0A1Q9VZN3,MRWGRAVPDAPVTVTDPTDTRLDDIRDLNSSDRRPDLPGGKGLVVA...,0.994303,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.9943029880523682, 0.9995948672294617, 0.999...",147,0.979765,219,261,SPOUT
...,...,...,...,...,...,...,...,...,...,...
5725,A0A3P6SS45,MRQRHFHVTVALVRRFSSAKENTDVTYYIPPKTFRPAYYHPMKPSQ...,,,,,,112,466,ribosomal-mitochondrial
5726,A0A1U7S9S7,MAALRKKWSELCRLVQAESATAAAAAATAAAEAGLGPCYPPVVASV...,,,,,,85,415,ribosomal-mitochondrial
5727,U1Q8C1,MRGLFPVDMSDGRAIRIQARNAVYPPAQAPNAVRLLADMELELQRV...,,,,,,342,470,biosynthesis of lantibiotics
5728,A0A4Q3GPX1,VLYSATHQCRLVPRMASAYNILRSSHPLLRLLADLQYQGIQYQFLP...,,,,,,317,421,biosynthesis of lantibiotics


In [16]:
del reduced_data  # drop the list of dicts now that df_reduced exists

In [17]:
# How often each start token holds the lowest prediction.
# A value of '-1' means the row with suffix '_-1' scored lowest for that sequence.
df_reduced['min_start'].value_counts()

-1     513
0       42
212     41
214     36
215     36
      ... 
363      1
275      1
351      1
314      1
179      1
Name: min_start, Length: 359, dtype: int64

In [19]:
# Sequences for which no '<id>_<PATCH_SIZE>_-1' row was found in the predictions frame.
df_reduced[df_reduced['patched_preds'].isnull()]

Unnamed: 0,id,sequence_str,sequence_pred,patched_starts,patched_preds,min_start,min_pred,knot_start,knot_end,family
3465,A0A378Z4R7,MKHHWLAPDMTMSAPDYRALLRSALAFKRRYPSHTETALHGKTIYF...,,,,,,172,253,ATCase/OTCase
3466,A0A345H6R4,MMPLSWEIHMKQFTNIYDLESIPQTITEALELKANPFAYETLGKHK...,,,,,,178,245,ATCase/OTCase
3467,A0A662PLA8,MNIKVNLYGRDLITTQDWSIEEIEETISLASEFKRKYKDGESIPKL...,,,,,,170,250,ATCase/OTCase
3468,A0A644ZEF3,MRNFTSFADIGSVSKALEIAREVKENPFGWQELGKNKTLLMIFFNS...,,,,,,170,237,ATCase/OTCase
3469,A0A1H8ZSE5,MKHYTSIHDIDNIKKWIEDAKTLKANPLQHVALGKHMTIGLLFFNS...,,,,,,170,237,ATCase/OTCase
...,...,...,...,...,...,...,...,...,...,...
5725,A0A3P6SS45,MRQRHFHVTVALVRRFSSAKENTDVTYYIPPKTFRPAYYHPMKPSQ...,,,,,,112,466,ribosomal-mitochondrial
5726,A0A1U7S9S7,MAALRKKWSELCRLVQAESATAAAAAATAAAEAGLGPCYPPVVASV...,,,,,,85,415,ribosomal-mitochondrial
5727,U1Q8C1,MRGLFPVDMSDGRAIRIQARNAVYPPAQAPNAVRLLADMELELQRV...,,,,,,342,470,biosynthesis of lantibiotics
5728,A0A4Q3GPX1,VLYSATHQCRLVPRMASAYNILRSSHPLLRLLADLQYQGIQYQFLP...,,,,,,317,421,biosynthesis of lantibiotics


In [18]:
# Persist the summary table. ';' is used as the separator, and the row index is not written.
# NOTE(review): the list-valued columns (patched_starts / patched_preds) presumably
# end up as their string repr in the CSV — confirm the downstream reader parses them.
df_reduced.to_csv(OUTPUT, sep=';', encoding='utf-8', index=False)