# Notes

**Data explanation**:
- `train.jsonl`: JSONL file with proteins for training. Each line is a sample with id, sequence, label and resolved (9712 proteins)
- `val.jsonl`: JSONL file with proteins for validation. Each line is a sample with id, sequence, label and resolved (1080 proteins)
- `new_pisces.jsonl`: JSONL file with proteins for testing. Each line is a sample with id, sequence, label and resolved (364 proteins)

**Proposed splits**:
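- `sampled`: Predefined split taken from the source files: proteins from `train.jsonl` and `val.jsonl` form `train` (validation proteins are flagged with `VALIDATION=True`) and proteins from `new_pisces.jsonl` form `test`, i.e. roughly 97/3%.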

This is a well-known dataset used to validate the behavior of code and models. Only a `sampled` split is provided for this purpose.

# Configs & Imports

In [1]:
import json
from pathlib import Path

from pandas import DataFrame, concat, read_json

%load_ext autoreload
%autoreload 2

In [2]:
# Raw inputs are read from ../data/secondary_structure and the generated
# split files are written below ../splits/secondary_structure
# (both relative to the notebook's working directory).
data_path = Path('').joinpath('..', 'data', 'secondary_structure')
split_path = Path('').joinpath('..', 'splits', 'secondary_structure')

# One JSONL file per partition of the original dataset
train = data_path.joinpath('train.jsonl')
validation = data_path.joinpath('val.jsonl')
pisces = data_path.joinpath('new_pisces.jsonl')

# Obtain original dataset

In [3]:
# Parse the training JSONL (one protein per line), forcing every field to be read as text
train_set = read_json(
    train,
    lines=True,
    dtype=dict.fromkeys(("id", "sequence", "label", "resolved"), str),
)

In [4]:
# Preview the training proteins; pandas abbreviates the frame to its first and last rows
train_set

Unnamed: 0,id,sequence,label,resolved
0,1es5-A,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...,CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHH...,0011111111111111111111111111111111111111111111...
1,2a6h-E,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...,CCCCCHHHHHHHCCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCC...,0111111111111111111111111111111111111111111111...
2,5b1a-P,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...,CCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHH...,0011111111111111111111111111111111111111111111...
3,5ehi-C,GTGSQGETLGEKWKKKLNQLSRKEFDLYKKSGITEVDRTEAKEGLK...,CCCCCCCCHHHHHHHHHHCCCHHHHHHHHHCCCEEEECHHHHHHHC...,0000001111111111111111111111111111111111111111...
4,5egf-A,HHHHHHAVAKDSTESKSWEPFSLSPIKDPQALHAALCSKNVIPVTS...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHCCCCCCCC...,0000000000000000011111111111111111111111111111...
...,...,...,...,...
9707,3tew-A,EVKQENRLLNESESSSQGLLGYYFSDLNFQAPMVVTSSTTGDLSIP...,CCCCCCCCCCCCCCCCCCEEEEEECCCCCCCEEEEEEECCCECCEC...,0000000000000011111111111111111111111111111111...
9708,4r5r-B,GKECDCSSPENPCCDAATCKLRPGAQCGEGLCCEQCKFKKKRTICR...,CCCCCCCCCCCCCECCCCCCECCCCCCCCCCCEECCEECCCCCEEE...,1111111111111111111111111111111111111111111111...
9709,5xe7-A,GSHMASMEVSEFEALRQHLMSVAYRLTGTVADAEDIVQEAWLRWDS...,CCCCCCCCHHHHHHHHHHHHHHHHHHCCCHHHHHCCHHHHHHCCCC...,0000000111111111111111111111111111111111111111...
9710,2obn-C,GMRLPLNQRVAILLHEGTTGTIGKTGLALLRYSEAPIVAVIDRNCA...,CCCCCCCCCEEEECCCCCCCCCCHHHHHHHHHCCCCEEEEECCCCC...,0001111111111111111111111111111111111111111111...


In [5]:
# Parse the validation JSONL (one protein per line), forcing every field to be read as text
validation_set = read_json(
    validation,
    lines=True,
    dtype=dict.fromkeys(("id", "sequence", "label", "resolved"), str),
)

In [6]:
# Preview the validation proteins; pandas abbreviates the frame to its first and last rows
validation_set

Unnamed: 0,id,sequence,label,resolved
0,3gwq-A,GMKVTNYQGATIDPYSKGLGMVPGTSIQLTDAARLEWNLLNEDVSL...,CCCCCCCCCCECCCCCCCCECCCCCCCEHHHHHHHCCEHHHCCECC...,0000000001111111111111111111111111111111111111...
1,5amh-A,AMPLDAGGQNSTQMVLAPGASIFRCRQCGQTISRRDWLLPMGGDHE...,CCCCCCCCCCCCCCCCCCCCEEEEECCCCCEEEEHHHECCHHHCCE...,0000000000000000001111111111111111111111111111...
2,4w4k-B,MHFEAYPPEVNSANIYAGPGPDSMLAAARAWRSLDVEMTAVQRSFN...,CCHHHCCHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHHH...,0111111111111111111111111111111111111111111111...
3,1t33-B,MNIPTTTTKGEQAKSQLIAAALAQFGEYGLHATTRDIAALAGQNIA...,CCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHCCCCH...,0000001111111111111111111111111111111111111111...
4,5c8h-A,GTSYENSLLVKQSGSLPLSSLTHVLRSLTPNARGIFRLLIKYQLDN...,CCCCCCCCCCCCCCCCCHHHHHHHHCCCCHHHHHHHHHHHHHHHHC...,0000000000000011111111111111111111111111111111...
...,...,...,...,...
1075,1uuy-A,VPGPEYKVAILTVSDTVSAGAGPDRSGPRAVSVVDSSSEKLGGAKV...,CCCCCEEEEEEEECHHHHCCCCCCCHHHHHHHHHHHCCCCCCCEEE...,0011111111111111111111111111111111111111111111...
1076,2e6f-B,MMCLKLNLLDHVFANPFMNAAGVLCSTEEDLRCMTASSSGALVSKS...,CCCCCEEECCEEECCCEEECCCCCCCCHHHHHHHHHCCCCCEECCC...,1011111111111111111111111111111111111111111111...
1077,3can-A,SNAGGGVTFCGGEPLLHPEFLIDILKRCGQQGIHRAVDTTLLARKE...,CCCCCCEEECCCCHHHCHHHHHHHHHHHHHCCCCEEEECCCCCCHH...,0001111111111111111111111111111111111111111111...
1078,1yi9-A,CLGTIGPVTPLDASDFALDIRMPGVTPKESDTYFCMSMRLPVDEEA...,CCCCCCCEEECCCCEEEEEEECCCECCCCCCEEEEEEEECCCCCCE...,1111111111111111111111111111111111111111111111...


In [7]:
# Parse the new_pisces JSONL (one protein per line), forcing every field to be read as text
test_set = read_json(
    pisces,
    lines=True,
    dtype=dict.fromkeys(("id", "sequence", "label", "resolved"), str),
)

In [8]:
# Preview the test (new_pisces) proteins; pandas abbreviates the frame to its first and last rows
test_set

Unnamed: 0,id,sequence,label,resolved
0,6o41-O,MTPAVTTYKLVINGKTLKGETTTKAVDAETAEKAFKQYANDNGVDG...,CCCCCEEEEEEEECCCEEEEEEEEECCHHHHHHHHHHHHHHCCCCC...,0000111111111111111111111111111111111111111111...
1,6o43-A,MNDQEKIDKFTHSYINDDFGLTIDQLVPKVKGYGRFNVWLGGNESK...,CCHHHHHHHHHCCECCCCCCCCHHHHHHHHCCCHHHHHCCCCCHHH...,0111111111111111111111111111111111111111111111...
2,6o5k-B,GPGFMRDSGSKASSDSQDANQCCTSCEDNAPATSYCVECSEPLCET...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCEEECCCCEEECHH...,0000000000000000000000111111111111111111111111...
3,6o6d-A,SNALSRNEVLLNGDINFKEVRCVGDNGEVYGIISSKEALKIAQNLG...,CCCCCCCCCCEHHHCCCCCEEEEECCCCEEEEECHHHHHHHHHHHC...,0000000111111111111111111111111111111111111111...
4,6o6j-A,MAKGKSEVVEQNHTLILGWSDKLGSLLNQLAIANESLGGGTIAVMA...,CCCCCCCCCCCCCEEEECCCCCHHHHHHHHHHHHHHHCCCEEEEEE...,0000000000111111111111111111111111111111111111...
...,...,...,...,...
359,6to0-B,MNITGKGAYDTGTYANLFQRSGYREDEIKARLEQTWNDLFYGDEHT...,CCCCCCCHHHHCCCCCHHHHCCCCHHHHHHHHHHHHHHHHHCCCCC...,0000011111111111111111111111111111111111111111...
360,6tug-B,SNAMKILFSPIGNTDPWRNDRDGAMLHIVRHYQPDRVVLFFTESIW...,CCCCEEEEEECCCCCCEECCEECHHHHHHHHHCCCEEEEEEEHHHH...,0011111111111111111111111111111111111111111111...
361,6tv1-A,SPMMPDEIKYEDYRESLNLPDIVANGALPIGLDYEGVTLQKIKLTE...,CCCCCCCCCHHHHHHHCCHHHHHHCCEEEEEEECCCCCEEEEECCC...,1111111111111111111111111111111111111111111111...
362,6tx2-A,MGHHHHHHLRKEVENHYKLSLPEDFYHFWKFCEELDPEKPSDSLSA...,CCCCCCCCCHHHHHHHHCCCCCHHHHHHHHHHHHHCCCCHHHCCHH...,0000001111111111111111111111111111111111111111...


# Splits

In [9]:
# Let's create a sequences.fasta file with all the sequences.
# The (id, sequence) columns of every partition are stacked with `concat`:
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0.
sequences = concat(
    [partition[["id", "sequence"]] for partition in (train_set, validation_set, test_set)],
    ignore_index=True,  # renumber rows 0..n-1, as reset_index(drop=True) did
)

# Make sure the destination folder exists so the cell also runs on a fresh checkout
output_dir = split_path / 'splits'
output_dir.mkdir(parents=True, exist_ok=True)

# One FASTA record per protein: a '>id' header line followed by the sequence
with open(output_dir / 'sequences.fasta', 'w') as sequences_file:
    # Zipping the two columns avoids building a Series per row (iterrows)
    for protein_id, sequence in zip(sequences['id'], sequences['sequence']):
        sequences_file.write('>{}\n'.format(protein_id))
        sequences_file.write('{}\n'.format(sequence))

In [10]:
# Preview the combined (id, sequence) table built from the three partitions
sequences

Unnamed: 0,id,sequence
0,1es5-A,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...
1,2a6h-E,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...
2,5b1a-P,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...
3,5ehi-C,GTGSQGETLGEKWKKKLNQLSRKEFDLYKKSGITEVDRTEAKEGLK...
4,5egf-A,HHHHHHAVAKDSTESKSWEPFSLSPIKDPQALHAALCSKNVIPVTS...
...,...,...
11151,6to0-B,MNITGKGAYDTGTYANLFQRSGYREDEIKARLEQTWNDLFYGDEHT...
11152,6tug-B,SNAMKILFSPIGNTDPWRNDRDGAMLHIVRHYQPDRVVLFFTESIW...
11153,6tv1-A,SPMMPDEIKYEDYRESLNLPDIVANGALPIGLDYEGVTLQKIKLTE...
11154,6tx2-A,MGHHHHHHLRKEVENHYKLSLPEDFYHFWKFCEELDPEKPSDSLSA...


In [11]:
# Let's create a resolved.fasta file with all the resolveds
resolved = DataFrame(columns = ["id", "resolved"])
resolved = resolved.append(train_set[["id", "resolved"]])
resolved = resolved.append(validation_set[["id", "resolved"]])
resolved = resolved.append(test_set[["id", "resolved"]])
resolved = resolved.reset_index(drop = True)

with open(split_path / 'splits' / 'mask.fasta', 'w') as resolved_file:
    for index, row in resolved.iterrows():
        resolved_file.write('>{}\n'.format(row['id']))
        resolved_file.write('{}\n'.format(row['resolved']))

In [12]:
# Preview the combined (id, resolved) table built from the three partitions
resolved

Unnamed: 0,id,resolved
0,1es5-A,0011111111111111111111111111111111111111111111...
1,2a6h-E,0111111111111111111111111111111111111111111111...
2,5b1a-P,0011111111111111111111111111111111111111111111...
3,5ehi-C,0000001111111111111111111111111111111111111111...
4,5egf-A,0000000000000000011111111111111111111111111111...
...,...,...
11151,6to0-B,0000011111111111111111111111111111111111111111...
11152,6tug-B,0011111111111111111111111111111111111111111111...
11153,6tv1-A,1111111111111111111111111111111111111111111111...
11154,6tx2-A,0000001111111111111111111111111111111111111111...


## sampled

In [13]:
# Let's create the split dataset
sampled = DataFrame(columns = ["id", "label", "set", "validation"])

# Add train samples
tmp = train_set[["id", "label"]]
tmp.insert(2, "set", ["train"]*len(tmp))
sampled = sampled.append(tmp)

# Add validation samples
tmp = validation_set[["id", "label"]]
tmp.insert(2, "set", ["train"]*len(tmp))
tmp.insert(2, "validation", [True]*len(tmp))
sampled = sampled.append(tmp)

# Add test samples
tmp = test_set[["id", "label"]]
tmp.insert(2, "set", ["test"]*len(tmp))
sampled = sampled.append(tmp)

sampled = sampled.reset_index(drop = True)

In [14]:
# Preview the split table; the validation column is only filled for validation proteins
sampled

Unnamed: 0,id,label,set,validation
0,1es5-A,CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHH...,train,
1,2a6h-E,CCCCCHHHHHHHCCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCC...,train,
2,5b1a-P,CCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHH...,train,
3,5ehi-C,CCCCCCCCHHHHHHHHHHCCCHHHHHHHHHCCCEEEECHHHHHHHC...,train,
4,5egf-A,CCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHCCCCCCCC...,train,
...,...,...,...,...
11151,6to0-B,CCCCCCCHHHHCCCCCHHHHCCCCHHHHHHHHHHHHHHHHHCCCCC...,test,
11152,6tug-B,CCCCEEEEEECCCCCCEECCEECHHHHHHHHHCCCEEEEEEEHHHH...,test,
11153,6tv1-A,CCCCCCCCCHHHHHHHCCHHHHHHCCEEEEEEECCCCCEEEEECCC...,test,
11154,6tx2-A,CCCCCCCCCHHHHHHHHCCCCCHHHHHHHHHHHHHCCCCHHHCCHH...,test,


In [15]:
# Export the split as FASTA: each header carries the protein id, its set and
# the validation flag; the record body is the label string.

# Make sure the destination folder exists so the cell also runs on a fresh checkout
output_dir = split_path / 'splits'
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / 'sampled.fasta', 'w') as sampled_file:
    # Zipping the columns avoids building a Series per row (iterrows)
    for protein_id, subset, label, flag in zip(
        sampled['id'], sampled['set'], sampled['label'], sampled['validation']
    ):
        # `flag` is True for validation proteins and missing (NaN) otherwise.
        # NaN is truthy, so it must be compared against True explicitly.
        # The result is stored under its own name: assigning to `validation`
        # would overwrite the validation input path defined in the config cell
        # and break a later re-run of the loading cells.
        is_validation = 'True' if flag == True else 'False'  # noqa: E712
        sampled_file.write('>{}\n'.format('{} SET={} VALIDATION={}'.format(protein_id, subset, is_validation)))
        sampled_file.write('{}\n'.format(label))