# Computing overlaps of knot cores with minimums

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
from tqdm import tqdm

In [2]:
# Patch size used both in the dataset name and in the row IDs looked up later
# ('<seq_id>_<PATCH_SIZE>_<patch_start>').
PATCH_SIZE = 40
# Hugging Face Hub dataset that holds the per-row `m1_preds` predictions.
HF_DATASET = f'roa7n/patched_1000_test_p_{PATCH_SIZE}_m1_predictions'
# Destination of the per-sequence summary written by the last cell.
OUTPUT = f'/home/jovyan/data/proteins_m1/preds_{PATCH_SIZE}_minimums.csv'

In [3]:
# Register tqdm's progress_* helpers on pandas objects.
# NOTE(review): no cell visible in this notebook calls them — confirm this is still needed.
tqdm.pandas()

In [4]:
# Load the prediction dataset named in the config cell.
hf_dataset = load_dataset(HF_DATASET)
hf_dataset  # show the available splits, columns and row counts

Downloading readme:   0%|          | 0.00/497 [00:00<?, ?B/s]

Using custom data configuration roa7n--patched_1000_test_p_40_m1_predictions-bf4ebb9dbf4b17fe


Downloading and preparing dataset None/None to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40_m1_predictions-bf4ebb9dbf4b17fe/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/29.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1663294 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40_m1_predictions-bf4ebb9dbf4b17fe/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'sequence_str', 'label', 'm1_preds'],
        num_rows: 1663294
    })
})

In [5]:
# Work on the 'train' split as a pandas DataFrame so rows can be looked up by ID.
df = hf_dataset['train'].to_pandas()
print(df.shape)

(1663294, 4)


In [6]:
# Preview the predictions. IDs appear to follow '<seq_id>_<patch_size>_<patch_start>',
# where start -1 is the unpatched sequence.
df

Unnamed: 0,id,sequence_str,label,m1_preds
0,A0A533UME0_40_-1,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.993164
1,A0A533UME0_40_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYIYHES...,1,0.994141
2,A0A533UME0_40_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIYHES...,1,0.994141
3,A0A533UME0_40_2,MKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYHES...,1,0.993652
4,A0A533UME0_40_3,MKLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXHES...,1,0.993652
...,...,...,...,...
1663289,A0A6A4IYK5_40_292,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
1663290,A0A6A4IYK5_40_293,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
1663291,A0A6A4IYK5_40_294,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
1663292,A0A6A4IYK5_40_295,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117


## Calculate minimum for each sequence

1. Select all predictions that belong to one sequence (the original and its patched versions)
2. Check that the original sequence was predicted as having a knot
3. Take the minimum prediction over the patched versions
4. Save the results

1. Create a list of IDs from original CSV file:

In [7]:
import csv

# Raw per-sequence metadata, including the knot_start / knot_end columns.
INPUT = '/home/jovyan/data/proteins_m3_v2/raw_inputs/raw_new_data_with_core_intervals.csv'
CSV_DELIMITER = ','

# Read every row (header included) as a list of strings.
with open(INPUT, newline='') as csv_file:
    data = [row for row in csv.reader(csv_file, delimiter=CSV_DELIMITER)]

data[:2]

[['seq_id',
  'sequence',
  'global_metric_value',
  'domain_architecture',
  'interpro',
  'max_knot_topology',
  'seq_length',
  'label',
  'family',
  'knot_start',
  'knot_end',
  'knot_len',
  'core_percentage'],
 ['A0A533UME0',
  'MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHESSGSDRDRSLIRTILKYLETPQYLRRGLFQKISELKFAGSLSPLKIPHHTYTSDSHKIKAGDIREGMIVFAKGRKFVDVGLDQIITYSGEDKEGKRVTMQFKTGYPELLAKQISRNEIKQYWGYEVKESANLRTLLSGWNSNVILTTKKGKTIHKVQKYFDEISNNPVLVVFGSPERGIHEILGISIKEIPKSQNLNFFPEQATETVRLEEAILGTLAILNILIRN',
  '92.81',
  'PF02598;',
  'IPR029028;IPR012340;IPR003750;IPR029026;',
  '3_1',
  '271',
  '1',
  'SPOUT',
  '38',
  '246',
  '208',
  '76.75276752767527']]

In [8]:
# The first row carries the column names; every following row is a record.
header_row = data[0]
df_raw = pd.DataFrame(data[1:], columns=header_row)
del data  # the raw row list is not needed once the frame exists
df_raw

Unnamed: 0,seq_id,sequence,global_metric_value,domain_architecture,interpro,max_knot_topology,seq_length,label,family,knot_start,knot_end,knot_len,core_percentage
0,A0A533UME0,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,92.81,PF02598;,IPR029028;IPR012340;IPR003750;IPR029026;,3_1,271,1,SPOUT,38,246,208,76.75276752767527
1,A0A2S2NXP0,MKAIGSSFSVNFRRILCHNVTARTYVSDTKGSDKKQSNDDILNSIK...,77.0,PF00588;,IPR029028;IPR029064;IPR001537;IPR013123;IPR029...,3_1,376,1,SPOUT,95,307,212,56.38297872340426
2,A0A154BTR5,MSSVYLGLLHHPIYNKNDEIVATAVTNFDIHDIARAARTYDISRYF...,92.88,PF09936;,IPR029028;IPR019230;IPR029026;,3_1,192,1,SPOUT,108,156,48,25.0
3,A0A5J4Z539,MNAIAFGCSSQTWCTVRARHETRIRVRTCASGNARVGEGSNDRKAA...,73.0,PF04452;,IPR029028;IPR006700;IPR046886;IPR029026;,3_1,381,1,SPOUT,287,351,64,16.79790026246719
4,A0A1Q9VZN3,MRWGRAVPDAPVTVTDPTDTRLDDIRDLNSSDRRPDLPGGKGLVVA...,92.19,PF00588;,IPR029028;IPR029064;IPR001537;IPR029026;,3_1,288,1,SPOUT,219,261,42,14.583333333333334
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5725,A0A3P6SS45,MRQRHFHVTVALVRRFSSAKENTDVTYYIPPKTFRPAYYHPMKPSQ...,79.38,PF07147;,IPR039982;IPR010793;,3_1,499,1,ribosomal-mitochondrial,112,466,354,70.94188376753507
5726,A0A1U7S9S7,MAALRKKWSELCRLVQAESATAAAAAATAAAEAGLGPCYPPVVASV...,86.5,PF07147;,IPR039982;IPR010793;,3_1,433,1,ribosomal-mitochondrial,85,415,330,76.21247113163972
5727,U1Q8C1,MRGLFPVDMSDGRAIRIQARNAVYPPAQAPNAVRLLADMELELQRV...,89.88,PF14028;PF04738;,IPR006827;IPR023809;,3_1,490,1,biosynthesis of lantibiotics,342,470,128,26.122448979591837
5728,A0A4Q3GPX1,VLYSATHQCRLVPRMASAYNILRSSHPLLRLLADLQYQGIQYQFLP...,87.94,PF14028;PF04738;,IPR006827;IPR023809;,3_1,444,1,biosynthesis of lantibiotics,317,421,104,23.423423423423422


In [9]:
# Sequence IDs in file order; the reduction loop iterates over this list.
raw_ids = df_raw['seq_id'].tolist()
len(raw_ids)

5730

In [10]:
# Index both frames by ID so the loop can use label lookups.
# The guards make this cell safe to re-run: once the column has become the
# index, calling set_index again would raise KeyError. If the column is
# genuinely missing, set_index still raises as before.
if df_raw.index.name != 'seq_id':
    df_raw = df_raw.set_index('seq_id')
if df.index.name != 'id':
    df = df.set_index('id')
# String index so the .str accessor and f-string keys can be used on it.
df.index = df.index.astype('str')

2. Iterate over IDs in the list:

In [11]:
# Spot-check: show every prediction row whose ID contains this sequence ID
# (substring match, so this is only suitable for manual inspection).
df[df.index.str.contains('A0A533UME0')]

Unnamed: 0_level_0,sequence_str,label,m1_preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A533UME0_40_-1,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.993164
A0A533UME0_40_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYIYHES...,1,0.994141
A0A533UME0_40_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIYHES...,1,0.994141
A0A533UME0_40_2,MKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYHES...,1,0.993652
A0A533UME0_40_3,MKLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXHES...,1,0.993652
...,...,...,...
A0A533UME0_40_227,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.034943
A0A533UME0_40_228,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.030334
A0A533UME0_40_229,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.042358
A0A533UME0_40_230,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1,0.592285


In [12]:
import numpy as np

def extract_seq_min_info(df):
    """Collect the predictions of one sequence and locate the lowest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single sequence. The index holds string IDs whose last
        underscore-separated field is the patch start; the ``m1_preds``
        column holds the predictions. Every row passed in takes part in the
        minimum, including an unpatched row (start ``'-1'``) if present.

    Returns
    -------
    tuple
        ``(preds, starts, min_pred, min_i)`` where ``preds`` is the list of
        predictions, ``starts`` the matching patch starts (as strings),
        ``min_pred`` the lowest prediction and ``min_i`` the patch start at
        which it occurs (the first one on ties).

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    preds = list(df['m1_preds'])
    # The patch start is the LAST field of the ID. Splitting from the right
    # stays correct when the sequence ID itself contains underscores, whereas
    # a fixed position counted from the left would pick the wrong field.
    starts = [id_str.rsplit('_', 1)[-1] for id_str in df.index]
    min_pred = min(preds)
    min_i = starts[np.argmin(preds)]
    return preds, starts, min_pred, min_i

In [13]:
# Group the prediction rows by parent sequence ID once, instead of scanning the
# whole index with a substring/regex match for every sequence. Row IDs have the
# form '<seq_id>_<patch_size>_<patch_start>', so dropping the last two fields
# gives the parent ID. Exact grouping also cannot pick up rows of a different
# sequence whose ID merely contains this one.
parent_ids = df.index.map(lambda row_id: row_id.rsplit('_', 2)[0]).to_numpy()
preds_by_seq = df.groupby(parent_ids, sort=False)

reduced_data = []

for i, raw_id in enumerate(raw_ids):
    raw_seq_info = df_raw.loc[raw_id]
    original_key = f'{raw_id}_{PATCH_SIZE}_-1'

    # Start from the "no predictions available" record; the prediction fields
    # are filled in below when the sequence is present in the dataset.
    seq_dict = {'id': raw_id,
                'sequence_str': raw_seq_info['sequence'],
                'sequence_pred': None,
                'patched_starts': None,
                'patched_preds': None,
                'min_start': None,
                'min_pred': None,
                'knot_start': raw_seq_info['knot_start'],
                'knot_end': raw_seq_info['knot_end'],
                'family': raw_seq_info['family']}

    # NOTE(review): only the presence of the unpatched ('-1') row is checked;
    # its prediction is stored but not compared against any knot threshold.
    if original_key in df.index:
        seq_info = df.loc[original_key]

        # The group holds the unpatched row together with all patched rows,
        # in their original order.
        patched_preds, patched_starts, min_pred, min_start = extract_seq_min_info(
            preds_by_seq.get_group(raw_id))

        seq_dict.update(sequence_pred=seq_info['m1_preds'],
                        patched_starts=patched_starts,
                        patched_preds=patched_preds,
                        min_start=min_start,
                        min_pred=min_pred)

    if i % 500 == 0:
        print(f'[{i:4}/{len(raw_ids)}]')

    reduced_data.append(seq_dict)

[   0/5730]
[ 500/5730]
[1000/5730]
[1500/5730]
[2000/5730]
[2500/5730]
[3000/5730]
[3500/5730]
[4000/5730]
[4500/5730]
[5000/5730]
[5500/5730]


In [14]:
# One row per input sequence, built in a single call from the collected dicts.
df_reduced = pd.DataFrame(reduced_data)
df_reduced

Unnamed: 0,id,sequence_str,sequence_pred,patched_starts,patched_preds,min_start,min_pred,knot_start,knot_end,family
0,A0A533UME0,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,0.993164,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.9931640625, 0.994140625, 0.994140625, 0.993...",228,0.030334,38,246,SPOUT
1,A0A2S2NXP0,MKAIGSSFSVNFRRILCHNVTARTYVSDTKGSDKKQSNDDILNSIK...,0.997559,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.99755859375, 0.99755859375, 0.99755859375, ...",304,0.966309,95,307,SPOUT
2,A0A154BTR5,MSSVYLGLLHHPIYNKNDEIVATAVTNFDIHDIARAARTYDISRYF...,0.996094,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.99609375, 0.99609375, 0.99609375, 0.9960937...",152,0.002371,108,156,SPOUT
3,A0A5J4Z539,MNAIAFGCSSQTWCTVRARHETRIRVRTCASGNARVGEGSNDRKAA...,0.996582,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.99658203125, 0.99658203125, 0.99658203125, ...",324,0.002867,287,351,SPOUT
4,A0A1Q9VZN3,MRWGRAVPDAPVTVTDPTDTRLDDIRDLNSSDRRPDLPGGKGLVVA...,0.995605,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.99560546875, 0.9951171875, 0.9951171875, 0....",248,0.000133,219,261,SPOUT
...,...,...,...,...,...,...,...,...,...,...
5725,A0A3P6SS45,MRQRHFHVTVALVRRFSSAKENTDVTYYIPPKTFRPAYYHPMKPSQ...,0.996582,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.99658203125, 0.99609375, 0.99609375, 0.9960...",401,0.994629,112,466,ribosomal-mitochondrial
5726,A0A1U7S9S7,MAALRKKWSELCRLVQAESATAAAAAATAAAEAGLGPCYPPVVASV...,0.979980,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.97998046875, 0.9697265625, 0.96923828125, 0...",387,0.051880,85,415,ribosomal-mitochondrial
5727,U1Q8C1,MRGLFPVDMSDGRAIRIQARNAVYPPAQAPNAVRLLADMELELQRV...,0.958984,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.958984375, 0.9736328125, 0.97265625, 0.9760...",179,0.886230,342,470,biosynthesis of lantibiotics
5728,A0A4Q3GPX1,VLYSATHQCRLVPRMASAYNILRSSHPLLRLLADLQYQGIQYQFLP...,0.954102,"[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","[0.9541015625, 0.966796875, 0.95751953125, 0.9...",37,0.055328,317,421,biosynthesis of lantibiotics


In [15]:
# Drop the list of dicts now that df_reduced holds the same records.
# Re-running the cell that builds df_reduced afterwards requires re-running the loop first.
del(reduced_data)

In [16]:
# How often each patch start yields the minimum prediction.
# min_start values are strings taken from the patch-start field of the row ID.
df_reduced['min_start'].value_counts()

3      136
0      123
4      113
5      110
10     105
      ... 
364      1
339      1
453      1
394      1
387      1
Name: min_start, Length: 413, dtype: int64

In [17]:
# Sequences for which no predictions were found (placeholder rows with None values).
df_reduced[df_reduced['patched_preds'].isnull()]

Unnamed: 0,id,sequence_str,sequence_pred,patched_starts,patched_preds,min_start,min_pred,knot_start,knot_end,family


In [18]:
# Spot-check another sequence: show every prediction row whose ID contains this sequence ID.
df[df.index.str.contains('A0A6A4IYK5')]

Unnamed: 0_level_0,sequence_str,label,m1_preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A6A4IYK5_40_-1,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.996094
A0A6A4IYK5_40_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXGQFEDL...,1,0.994629
A0A6A4IYK5_40_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXQFEDL...,1,0.989258
A0A6A4IYK5_40_2,MSXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXFEDL...,1,0.992188
A0A6A4IYK5_40_3,MSYXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXEDL...,1,0.991211
...,...,...,...
A0A6A4IYK5_40_292,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
A0A6A4IYK5_40_293,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
A0A6A4IYK5_40_294,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117
A0A6A4IYK5_40_295,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1,0.995117


In [19]:
# Write the summary as a semicolon-separated UTF-8 CSV without the row index.
df_reduced.to_csv(OUTPUT, sep=';', encoding='utf-8', index=False)