# Notes

Used dataset ... 

**Data explanation**:
- `train.jsonl`: JSONL file with proteins for training. Each line is a sample with id, sequence, label and resolved (ignored) (9712 proteins)
- `val.jsonl`: JSONL file with proteins for validation. Each line is a sample with id, sequence, label and resolved (ignored) (1080 proteins)
- `new_pisces.jsonl`: JSONL file with proteins for testing. Each line is a sample with id, sequence, label and resolved (ignored) (648 proteins)

**Proposed splits**:
- `sampled`: Randomly split sequences into `train`/`test` with 95/5% probability.

This is a well-known dataset used to validate the behavior of code and models. Only provided a `sampled` split for this purpose.

# Configs & Imports

In [1]:
from pathlib import Path

from pandas import DataFrame, read_json

import json

%load_ext autoreload
%autoreload 2

In [2]:
# Where the raw data is stored and where processed data will be deposited
data_path = Path('') / '..' / 'data' / 'secondary_struture'

train = data_path / 'train.jsonl'
validation = data_path / 'val.jsonl'
casp12 = data_path / 'casp12.jsonl' # TODO: What is this?
pisces = data_path / 'new_pisces.jsonl' # TODO: What is this?

split_path = Path('') / '..' / 'splits' / '2STR'

# Obtain original dataset

In [3]:
train_set = read_json(path_or_buf=train, lines=True, dtype={"id": str, 
                                                            "sequence": str,
                                                            "label": str,
                                                            "resolved": str})

In [4]:
train_set

Unnamed: 0,id,sequence,label,resolved
0,1es5-A,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...,CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHH...,0011111111111111111111111111111111111111111111...
1,2a6h-E,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...,CCCCCHHHHHHHCCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCC...,0111111111111111111111111111111111111111111111...
2,5b1a-P,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...,CCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHH...,0011111111111111111111111111111111111111111111...
3,5ehi-C,GTGSQGETLGEKWKKKLNQLSRKEFDLYKKSGITEVDRTEAKEGLK...,CCCCCCCCHHHHHHHHHHCCCHHHHHHHHHCCCEEEECHHHHHHHC...,0000001111111111111111111111111111111111111111...
4,5egf-A,HHHHHHAVAKDSTESKSWEPFSLSPIKDPQALHAALCSKNVIPVTS...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHCCCCCCCC...,0000000000000000011111111111111111111111111111...
...,...,...,...,...
9707,3tew-A,EVKQENRLLNESESSSQGLLGYYFSDLNFQAPMVVTSSTTGDLSIP...,CCCCCCCCCCCCCCCCCCEEEEEECCCCCCCEEEEEEECCCECCEC...,0000000000000011111111111111111111111111111111...
9708,4r5r-B,GKECDCSSPENPCCDAATCKLRPGAQCGEGLCCEQCKFKKKRTICR...,CCCCCCCCCCCCCECCCCCCECCCCCCCCCCCEECCEECCCCCEEE...,1111111111111111111111111111111111111111111111...
9709,5xe7-A,GSHMASMEVSEFEALRQHLMSVAYRLTGTVADAEDIVQEAWLRWDS...,CCCCCCCCHHHHHHHHHHHHHHHHHHCCCHHHHHCCHHHHHHCCCC...,0000000111111111111111111111111111111111111111...
9710,2obn-C,GMRLPLNQRVAILLHEGTTGTIGKTGLALLRYSEAPIVAVIDRNCA...,CCCCCCCCCEEEECCCCCCCCCCHHHHHHHHHCCCCEEEEECCCCC...,0001111111111111111111111111111111111111111111...


In [5]:
validation_set = read_json(path_or_buf=validation, lines=True, dtype={"id": str, 
                                                                      "sequence": str,
                                                                      "label": str,
                                                                      "resolved": str})

In [6]:
validation_set

Unnamed: 0,id,sequence,label,resolved
0,3gwq-A,GMKVTNYQGATIDPYSKGLGMVPGTSIQLTDAARLEWNLLNEDVSL...,CCCCCCCCCCECCCCCCCCECCCCCCCEHHHHHHHCCEHHHCCECC...,0000000001111111111111111111111111111111111111...
1,5amh-A,AMPLDAGGQNSTQMVLAPGASIFRCRQCGQTISRRDWLLPMGGDHE...,CCCCCCCCCCCCCCCCCCCCEEEEECCCCCEEEEHHHECCHHHCCE...,0000000000000000001111111111111111111111111111...
2,4w4k-B,MHFEAYPPEVNSANIYAGPGPDSMLAAARAWRSLDVEMTAVQRSFN...,CCHHHCCHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHHHHHHH...,0111111111111111111111111111111111111111111111...
3,1t33-B,MNIPTTTTKGEQAKSQLIAAALAQFGEYGLHATTRDIAALAGQNIA...,CCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHCCHHHHHHHHCCCCH...,0000001111111111111111111111111111111111111111...
4,5c8h-A,GTSYENSLLVKQSGSLPLSSLTHVLRSLTPNARGIFRLLIKYQLDN...,CCCCCCCCCCCCCCCCCHHHHHHHHCCCCHHHHHHHHHHHHHHHHC...,0000000000000011111111111111111111111111111111...
...,...,...,...,...
1075,1uuy-A,VPGPEYKVAILTVSDTVSAGAGPDRSGPRAVSVVDSSSEKLGGAKV...,CCCCCEEEEEEEECHHHHCCCCCCCHHHHHHHHHHHCCCCCCCEEE...,0011111111111111111111111111111111111111111111...
1076,2e6f-B,MMCLKLNLLDHVFANPFMNAAGVLCSTEEDLRCMTASSSGALVSKS...,CCCCCEEECCEEECCCEEECCCCCCCCHHHHHHHHHCCCCCEECCC...,1011111111111111111111111111111111111111111111...
1077,3can-A,SNAGGGVTFCGGEPLLHPEFLIDILKRCGQQGIHRAVDTTLLARKE...,CCCCCCEEECCCCHHHCHHHHHHHHHHHHHCCCCEEEECCCCCCHH...,0001111111111111111111111111111111111111111111...
1078,1yi9-A,CLGTIGPVTPLDASDFALDIRMPGVTPKESDTYFCMSMRLPVDEEA...,CCCCCCCEEECCCCEEEEEEECCCECCCCCCEEEEEEEECCCCCCE...,1111111111111111111111111111111111111111111111...


In [7]:
test_set = read_json(path_or_buf=pisces, lines=True, dtype={"id": str, 
                                                            "sequence": str,
                                                            "label": str,
                                                            "resolved": str})

In [8]:
test_set

Unnamed: 0,id,sequence,label,resolved
0,6pwh-B,SLFELGKMILQETGKNPAKSYGVYGCNCGVGGRGKPKDATDRCCYV...,CHHHHHHHHHHHHCCCHHHHHCECCCCCCCCCCCCCCCHHHHHHHH...,1111111111111111111111111111111111111111111111...
1,6pws-A,ANGSVCNTCPEAWIYFQKKCYYFGEGAKKWIQARYACENLHGRLVS...,CCCCCCCECCCCCEEECCEEEEEEEEEECHHHHHHHHHHCCCEECC...,0000011111111111111111111111111111111111111111...
2,6pxe-T,MEYYKQSKYETYSEIIEKERTARFESVALEQLQIVHISSEADFSAV...,CCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHCCCEEEE...,0000000000000000011111111111111111111111111111...
3,6pxu-C,GAGATGAGAGYYITPRTGAGA,CCCCCCCCCCCCCCCCCCCCC,000011000011111111111
4,6pyo-B,ETGIGQRIVCLVLDKSGSMATGNRLNRLNQAGQLFLLQTVELGSWV...,CCCCCCCEEEEEEECCHHHHCCCHHHHHHHHHHHHHHCCCCCCCEE...,0000111111111111111111111111111111111111111111...
...,...,...,...,...
643,6p82-D,SNALSIDEAFRKFKSRLELNEREQKNASQRQNEVRDYLQTKFGIAR...,CCCCCHHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHCCCCEEE...,0111111111111111111111111111111111111111111111...
644,6p8o-A,MSTVATYSYTHSVTYVTDNILKSLKDIILLSGLDPEHFADRWESNT...,CCEEEEEECCCCHHHHHHHHHHHHHHHHHHHCCCCHHHHHCHHHHH...,0111111111111111111111111111111111111111111111...
645,6p8p-D,MTTVVSRTFRSSPHRDALQTWDAIVELLTQGKDGTARSELRAVTGV...,CCCEEEEEEECCCCCCHHHHHHHHHHHHHCCCCCHHHHHHHHCHHH...,0111111111111111111111111111111111111111111111...
646,6p8r-B,SNASTVATYSYTHSVTYVTDNILKSLKDIILLSGLDPEHFADRWES...,CCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHCHHH...,1111111111111111111111111111111111111111111111...


# Splits

## sampled

In [9]:
# Let's create the split dataset
sampled = DataFrame(columns = ["sequence", "target", "set", "validation"])

# Add train samples
tmp = train_set[["sequence", "label"]]
tmp.rename(columns = {"label": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(tmp))
sampled = sampled.append(tmp)

# Add validation samples
tmp = validation_set[["sequence", "label"]]
tmp.rename(columns = {"label": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(tmp))
tmp.insert(2, "validation", [True]*len(tmp))
sampled = sampled.append(tmp)

# Add test samples
tmp = test_set[["sequence", "label"]]
tmp.rename(columns = {"label": "target"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(tmp))
sampled = sampled.append(tmp)

sampled = sampled.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [10]:
sampled

Unnamed: 0,sequence,target,set,validation
0,VTKPTIAAVGGYAMNNGTGTTLYTKAADTRRSTGSTTKIMTAKVVL...,CCCCCCCCCEEEEEECCCCCEEEEECCCCCECCHHHHHHHHHHHHH...,train,
1,MAEPGIDKLFGMVDSKYRLTVVVAKRAQQLLRHGFKNTVLEPEERP...,CCCCCHHHHHHHCCCHHHHHHHHHHHHHHHHHCCCCCCCCCCCCCC...,train,
2,MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIG...,CCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHCCCCHHHHHH...,train,
3,GTGSQGETLGEKWKKKLNQLSRKEFDLYKKSGITEVDRTEAKEGLK...,CCCCCCCCHHHHHHHHHHCCCHHHHHHHHHCCCEEEECHHHHHHHC...,train,
4,HHHHHHAVAKDSTESKSWEPFSLSPIKDPQALHAALCSKNVIPVTS...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHCCCCCCCC...,train,
...,...,...,...,...
11435,SNALSIDEAFRKFKSRLELNEREQKNASQRQNEVRDYLQTKFGIAR...,CCCCCHHHHHHHHHHHCCCCHHHHHHHHHHHHHHHHHHHCCCCEEE...,test,
11436,MSTVATYSYTHSVTYVTDNILKSLKDIILLSGLDPEHFADRWESNT...,CCEEEEEECCCCHHHHHHHHHHHHHHHHHHHCCCCHHHHHCHHHHH...,test,
11437,MTTVVSRTFRSSPHRDALQTWDAIVELLTQGKDGTARSELRAVTGV...,CCCEEEEEEECCCCCCHHHHHHHHHHHHHCCCCCHHHHHHHHCHHH...,test,
11438,SNASTVATYSYTHSVTYVTDNILKSLKDIILLSGLDPEHFADRWES...,CCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHHHCHHH...,test,


In [11]:
sampled.to_csv(split_path / 'splits' / 'sampled.csv', index = False)