# Generate patched HF dataset:

For each raw sequence, a set of patched versions of that sequence is created. Each ID follows the format `{orig_id}_{patch_size}_{patch_start_index}` for easier processing of the results; the unpatched original sequence is kept as well, with `patch_start_index` set to `-1`.

Load the subset of the test set with label 1: 1000 random samples for each family *(if a family had fewer than 1000 samples, all of them were taken)*:

In [1]:
# --- Input -------------------------------------------------------------------
# Delimiter and location of the raw CSV that is read below.
CSV_DELIMITER = ','
INPUT = '/home/jovyan/data/proteins_m3_v2/raw_inputs/raw_new_data_with_core_intervals.csv'

# --- Patching parameters -----------------------------------------------------
PATCH_SIZE = 100   # number of consecutive characters overwritten in each patched copy
PATCH_CHAR = 'X'   # filler character written over the patched window
OVERLAP_STEP = 1   # distance between the start indices of consecutive patches

# --- Output ------------------------------------------------------------------
# Hub repository id the dataset is pushed to; it embeds the patch size.
HF_DATASET = f'roa7n/patched_1000_test_p_{PATCH_SIZE}'

In [2]:
import csv

# Read the whole CSV into memory as a list of rows; the first row is the header.
with open(INPUT, newline='') as csv_file:
    data = list(csv.reader(csv_file, delimiter=CSV_DELIMITER))

# Preview the header row and the first record.
data[:2]

[['seq_id',
  'sequence',
  'global_metric_value',
  'domain_architecture',
  'interpro',
  'max_knot_topology',
  'seq_length',
  'label',
  'family',
  'knot_start',
  'knot_end',
  'knot_len',
  'core_percentage'],
 ['A0A533UME0',
  'MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHESSGSDRDRSLIRTILKYLETPQYLRRGLFQKISELKFAGSLSPLKIPHHTYTSDSHKIKAGDIREGMIVFAKGRKFVDVGLDQIITYSGEDKEGKRVTMQFKTGYPELLAKQISRNEIKQYWGYEVKESANLRTLLSGWNSNVILTTKKGKTIHKVQKYFDEISNNPVLVVFGSPERGIHEILGISIKEIPKSQNLNFFPEQATETVRLEEAILGTLAILNILIRN',
  '92.81',
  'PF02598;',
  'IPR029028;IPR012340;IPR003750;IPR029026;',
  '3_1',
  '271',
  '1',
  'SPOUT',
  '38',
  '246',
  '208',
  '76.75276752767527']]

Generate patched versions:

In [3]:
def patch_sequence(sequence_id, sequence, patch_size, overlap_step, patch_char, label=1):
    """Return the original sequence followed by all of its patched variants.

    A window of ``patch_size`` characters is slid over ``sequence`` in steps of
    ``overlap_step``; for every window position a copy of the sequence is
    produced in which the window is overwritten with ``patch_char``.

    Args:
        sequence_id: Identifier of the original sequence; used as the ID prefix.
        sequence: The sequence string to patch.
        patch_size: Length of the patched window (positive integer).
        overlap_step: Distance between consecutive window starts (positive integer).
        patch_char: Filler written over the window; it is repeated ``patch_size``
            times, so a single character keeps the sequence length unchanged.
        label: Label stored with every returned record (defaults to 1).

    Returns:
        A list of ``[id, sequence, label]`` records. The first record is the
        unpatched sequence with ID ``{sequence_id}_{patch_size}_-1``; each
        following record has ID ``{sequence_id}_{patch_size}_{start_index}``.
        If ``sequence`` is shorter than ``patch_size`` only the unpatched
        record is returned. With ``overlap_step > 1`` the tail of the sequence
        may not be covered by any patch.

    Raises:
        ValueError: If ``patch_size`` or ``overlap_step`` is smaller than 1.
    """
    # A non-positive patch size would silently emit unpatched/garbled copies,
    # and a negative step would silently emit no patches at all.
    if patch_size < 1:
        raise ValueError(f'patch_size must be a positive integer, got {patch_size}.')
    if overlap_step < 1:
        raise ValueError(f'overlap_step must be a positive integer, got {overlap_step}.')

    # Start index -1 marks the unpatched original sequence.
    patched_sequences = [[f'{sequence_id}_{patch_size}_-1', sequence, label]]
    patch = patch_char * patch_size
    # Exclusive upper bound: the last window must still fit inside the sequence.
    last_patch_start_i = len(sequence) - patch_size + 1

    for i in range(0, last_patch_start_i, overlap_step):
        patched_seq = sequence[:i] + patch + sequence[i + patch_size:]
        patched_sequences.append([f'{sequence_id}_{patch_size}_{i}', patched_seq, label])

    return patched_sequences

In [4]:
# Build the patched variants for every record; data[0] is the CSV header row,
# so it is skipped and excluded from the progress total.
records = data[1:]
total_sequences = len(records)

new_sequences = []
for i, row in enumerate(records, start=1):
    # Column 0 holds the sequence id, column 1 the sequence string.
    seq_id = row[0]
    seq_str = row[1]
    patched_versions = patch_sequence(seq_id, seq_str, PATCH_SIZE, OVERLAP_STEP, PATCH_CHAR)
    new_sequences.extend(patched_versions)

    # Report progress against the number of sequences, not the number of CSV rows.
    if i % 1000 == 0:
        print(f'Calculated sequences [{i}/{total_sequences}].')

len(new_sequences)

Calculated sequences [1000/5731].
Calculated sequences [2000/5731].
Calculated sequences [3000/5731].
Calculated sequences [4000/5731].
Calculated sequences [5000/5731].


1319717

In [5]:
import pandas as pd

# Tabulate the generated records; each record is [id, sequence, label].
column_names = ['id', 'sequence_str', 'label']
df = pd.DataFrame(data=new_sequences, columns=column_names)
df

Unnamed: 0,id,sequence_str,label
0,A0A533UME0_100_-1,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1
1,A0A533UME0_100_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,1
2,A0A533UME0_100_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,1
3,A0A533UME0_100_2,MKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,1
4,A0A533UME0_100_3,MKLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...,1
...,...,...,...
1319712,A0A6A4IYK5_100_232,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1319713,A0A6A4IYK5_100_233,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1319714,A0A6A4IYK5_100_234,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1319715,A0A6A4IYK5_100_235,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1


Push to HF hub:

In [6]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frame in a Hugging Face `Dataset` so it can be pushed to the hub.
hf_dataset = Dataset.from_pandas(df)
# Show the dataset summary (features and row count) as the cell output.
hf_dataset

Dataset({
    features: ['id', 'sequence_str', 'label'],
    num_rows: 1319717
})

In [7]:
# import the relevant libraries for logging in
import os
from getpass import getpass

from huggingface_hub import HfApi, HfFolder

# Never store the access token in the notebook source: read it from the
# HF_TOKEN environment variable and fall back to an interactive prompt.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

# Pass the token explicitly to the upload instead of persisting it through
# `HfApi.set_access_token`.
# NOTE(review): `HfApi.set_access_token` was deprecated in later huggingface_hub
# releases - confirm against the installed version before reintroducing it.
hf_dataset.push_to_hub(HF_DATASET, token=hf_token)



Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# from huggingface_hub import notebook_login

# notebook_login()

In [9]:
# dataset.push_to_hub('roa7n/patched_1000_test_p_20_50_80_100_150')