# Imports

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import time
from datetime import datetime
from functools import partial
import json

# Sampler

## Basic sampler

In [2]:
def basic_rand_sampler(seq, sample_len):
    """
    Basic random text sampler.
    If sample_len is greater than the length of the seq, the seq is returned.
    """
    seq_len   = len(seq)
    if seq_len > sample_len:
        start_idx = random.randint(0, min(seq_len,seq_len - sample_len))
        end_idx   = start_idx+sample_len
        return seq[start_idx:end_idx]
    else:
        return seq

In [3]:
text = "ABC DEF GHI JKL!"

In [4]:
[basic_rand_sampler(text, 5) for i in range(3)]

['C DEF', 'ABC D', ' DEF ']

In [5]:
[basic_rand_sampler(text, 8) for i in range(6)]

[' GHI JKL', 'C DEF GH', 'ABC DEF ', 'F GHI JK', 'F GHI JK', ' GHI JKL']

In [6]:
[basic_rand_sampler(text, 100) for i in range(3)]

['ABC DEF GHI JKL!', 'ABC DEF GHI JKL!', 'ABC DEF GHI JKL!']

## Identity sampler

In [7]:
identity_sampler = lambda x: x

In [8]:
assert text == identity_sampler(text)

# Tokenizer

## Basic aminoacid tokenizer

In [9]:
def basic_aa_tokenizer(seq, context_length, return_mask=True):
    """
    Maps a number between 0 and 21 to each 21 proteogenic aminoacids.
    Unknown char input gets mapped to 22.
    """
    aa = "ACDEFGHIKLMNOPQRSTUVWY"
    d = {a: i for i, a in enumerate(aa)}
    seq_len = len(seq)
    seq_empty = torch.zeros(context_length - len(seq), dtype=torch.long)
    seq_tok   = torch.tensor([d[a] if a in aa else 22 for a in seq], dtype=torch.long)
    seq = torch.cat([seq_tok, seq_empty], dim=0)
    if return_mask:
        mask = torch.zeros_like(seq).bool()
        mask[0:seq_len] = True
        return seq, mask
    else:
        return seq

In [10]:
aa_seq = "ACDEFGHIKLMNOPQRSTUVWYZZZ"

In [11]:
tokens, mask = basic_aa_tokenizer(aa_seq, 30)

In [12]:
tokens, mask

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 22, 22,  0,  0,  0,  0,  0]),
 tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True, False, False, False, False, False]))

In [13]:
assert len(tokens[mask]) == len(aa_seq) # because we have a 1:1 relationship, unlike with the text

In [14]:
assert len(tokens[mask]) == mask.sum()

In [15]:
assert tokens[~mask].sum() == 0.

## Text tokenizer

In [16]:
from simple_tokenizer import tokenize

In [17]:
tokens, mask = tokenize(text, context_length=30, return_mask=True)

In [18]:
tokens, mask

(tensor([[ 5334, 11649, 22279,    73, 14134,   256,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[ True,  True,  True,  True,  True,  True, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False]]))

In [19]:
assert len(tokens[mask]) == mask.sum()

In [20]:
assert tokens[~mask].sum() == 0.

# Dataset

In [21]:
class CLASPDataset(Dataset):
    """
    Basic CLASP dataset that loads the preprocessed csv file into RAM.
        path: path to the csv file
    """
    def __init__(self, path, text_sampler, bioseq_sampler, text_tok, bioseq_tok):
        super().__init__()
        self.path = path

        tp = time.time()
        with open(path, "r") as reader:
            self.data = reader.readlines()
        print(f"Load data time: {time.time() - tp:.3f} s")

        self.cols = self.data.pop(0).split(",")
        self.len  = len(self.data)

        self.text_sampler   = text_sampler
        self.bioseq_sampler = bioseq_sampler

        self.text_tok   = text_tok
        self.bioseq_tok = bioseq_tok

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        sample = self.data[idx][:-2] # without "\n"
        sample = sample.split(",")
        sample = [x for x in sample if len(x) > 0]

        text   = " ".join(sample[:-2])
        bioseq = sample[-1]

        text   = self.text_sampler(text)
        bioseq = self.bioseq_sampler(bioseq)
        
        print(text, len(text))
        print(bioseq, len(bioseq))

        text, text_mask = self.text_tok(text)
        bioseq, bioseq_mask = self.bioseq_tok(bioseq)

        return text, text_mask, bioseq, bioseq_mask

In [22]:
str_sampler = partial(basic_rand_sampler, sample_len=100)
text_tok    = partial(tokenize, context_length=120, return_mask=True)
bioseq_tok  = partial(basic_aa_tokenizer, context_length=120, return_mask=True)

In [23]:
ds = CLASPDataset(path="uniprot_100_reduced.csv",
                  text_sampler=str_sampler,
                  bioseq_sampler=str_sampler,
                  text_tok=text_tok,
                  bioseq_tok=bioseq_tok)

Load data time: 0.002 s


In [24]:
ds[1]

e archaeon Pyrococcus furiosus contains two thioredoxin fold units."; "Glutaredoxin-like protein gen 100
QLKQLVQELSELTDKLSYEIVDFDTPEGKELAKRYRIDRAPATTITQDGKDFGVRYFGLPAGHEFAAFLEDIVDVSREETNLMDETKQAIRNIDQDVRIL 100


(tensor([[  324, 10121,   525, 39762, 23036,  2736,   665, 15681,  3036, 12844,
           1237,  2362,  3580,   639, 41517, 11021, 10394, 18650,   282,   257,
           5952,   648,   515,   639, 41517,   268,   789,  8088,  3278,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[ True,  True,  True,  True,

# Dataloader

In [25]:
dl = DataLoader(ds, 32)

In [26]:
batch = next(iter(dl))

t."; -!- SUBCELLULAR LOCATION: Membrane . TRANSMEM 12..33 /note="Helical" /evidence="ECO:0000256|SAM 100
MVEALISDEAFIKGCAAIGAGLAVGLAGIGAGAGESGIGAAAVGAIAEDRGFLGLGLLFTVIPETIVIFGLVISFILMFA 80
uriosus."; "A protein disulfide oxidoreductase from the archaeon Pyrococcus furiosus contains two th 100
DETKQAIRNIDQDVRILVFVTPTCPYCPLAVRMAHKFAIENTKAGKGKILGDMVEAIEYPEWADQYNVMAVPKIVIQVNGEDRVEFEGAYPEKMFLEKLL 100
H Unreviewed; 32 AA. A0A7C3Z3T6; SubName: Full=DUF2080 family transposase-associated protein ; Flags 100
YERTITPFGNSAKLDAPKKYIGHRAYVIIVK 31
 ACT_SITE 372 /note="Nucleophile" /evidence="ECO:0000256|PROSITE-ProRule:PRU10055" 1: Evidence at pr 100
PKNFMFGYSWSGFQFEMGLPGSEVESDWWVWVHDKENIASGLVSGDLPENGPAYWHLYKQDHDIAEKLGMDCIRGGIEWARIFPKPTFDVKVDVEKDEEG 100
donado J.Q.; Lopez-Cortes A.; "Methane production in hypersaline environments: Microbial diversity;  100
YATAAYTNNILDDNLYYNVDYINDKYDGAANKGTDNKVNATMDVVKDIATESTIYGIENYEKYPTALEDHFGGSQRATVLSAAAGSAASLATGNANAGLS 100
 N.; Fullmer M.S.; Makkay A.M.; Wheeler R

In [27]:
[b.shape for b in batch]

[torch.Size([32, 1, 120]),
 torch.Size([32, 1, 120]),
 torch.Size([32, 120]),
 torch.Size([32, 120])]

# RankSplitDataset

For details see notebook `RankSplitDataset.ipynb`.

In [28]:
path_offset_dict = '../data/uniprot_sprot_offset_dict.json'

In [29]:
with open(path_offset_dict, "r", encoding='utf-8') as data_file:    
    offset_dict = json.load(data_file)

In [30]:
len(offset_dict.keys())

564278

In [31]:
file_path = "../data/uniprot_sprot.csv"

In [32]:
class RankSplitDataset(Dataset):
    def __init__(self, file_path, offset_dict, rank, world_size, logger=None):
        self.file_path        = file_path
        self.offset_dict      = offset_dict
        self.total_len        = len(offset_dict.keys())
        self.rank_len         = self.total_len // world_size
        self.rank_line_offset = self.rank_len * rank
        self.rank_byte_offset = self.offset_dict[str(self.rank_line_offset)] # because json keys are strings after it is saved

        if logger:
            logger.info(f"{datetime.now()} rank: {rank} dataset information:\n{'total len':>20}: {self.total_len}\n{'rank len':>20}: {self.rank_len}\n{'rank line offset':>20}: {self.rank_line_offset}\n{'rank byte offset':>20}: {self.rank_byte_offset}")
        else:
            print(f"{datetime.now()} rank: {rank} dataset information:\n{'total len':>20}: {self.total_len}\n{'rank len':>20}: {self.rank_len}\n{'rank line offset':>20}: {self.rank_line_offset}\n{'rank byte offset':>20}: {self.rank_byte_offset}")

        tp = time.time()
        with open(self.file_path, 'r', encoding='utf-8') as f:
            f.seek(self.rank_byte_offset) # move to the line for the specific rank
            lines = []
            for i in range(self.rank_len): # load all the lines for the rank
                line = f.readline()
                if line != "":
                    lines.append(line)

        self.data = lines

        if logger:
            logger.info(f"{datetime.now()} rank: {rank} dataset load data time: {time.time() - tp:.3f} s")
            logger.info(f"{datetime.now()} rank: {rank} dataset len: {len(self.data)}")
        else:
            print(f"{datetime.now()} rank: {rank} dataset load data time: {time.time() - tp:.3f} s")
            print(f"{datetime.now()} rank: {rank} dataset len: {len(self.data)}")
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# CLASPRankSplitDataset

In [33]:
class CLASPRankSplitDataset(RankSplitDataset):
    """
    CLASP rank split dataset that loads equally sized pieces for each rank
    of the preprocessed csv file into RAM.
        path: path to the csv file
    """
    def __init__(self, file_path, offset_dict, rank, world_size, logger,
                 text_sampler, bioseq_sampler, text_tok, bioseq_tok):
        super().__init__(file_path, offset_dict, rank, world_size, logger)

        self.text_sampler   = text_sampler
        self.bioseq_sampler = bioseq_sampler

        self.text_tok   = text_tok
        self.bioseq_tok = bioseq_tok

    def __getitem__(self, idx):
        sample = self.data[idx][:-1] # without "\n"
        sample = sample.split(",")
        sample = [x for x in sample if len(x) > 0]

        text   = " ".join(sample[:-1])
        bioseq = sample[-1]

        text   = self.text_sampler(text)
        bioseq = self.bioseq_sampler(bioseq)
        
        print(text, len(text))
        print(bioseq, len(bioseq))

        text, text_mask = self.text_tok(text)
        bioseq, bioseq_mask = self.bioseq_tok(bioseq)

        return text, text_mask, bioseq, bioseq_mask

In [34]:
str_sampler = partial(basic_rand_sampler, sample_len=100)
text_tok    = partial(tokenize, context_length=120, return_mask=True)
bioseq_tok  = partial(basic_aa_tokenizer, context_length=120, return_mask=True)

In [35]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       4.1Gi        22Gi       0.0Ki       4.7Gi        26Gi
Swap:         979Mi       979Mi       0.0Ki


In [36]:
ds1 = CLASPRankSplitDataset(file_path=file_path,
                           offset_dict=offset_dict,
                           rank=0,
                           world_size=2,
                           text_sampler=str_sampler,
                           bioseq_sampler=str_sampler,
                           text_tok=text_tok,
                           bioseq_tok=bioseq_tok,
                           logger=None)

2021-04-18 17:47:04.445135 rank: 0 dataset information:
           total len: 564278
            rank len: 282139
    rank line offset: 0
    rank byte offset: 118
2021-04-18 17:47:05.894728 rank: 0 dataset load data time: 1.449 s
2021-04-18 17:47:05.894806 rank: 0 dataset len: 282139


In [37]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       5.5Gi        21Gi       0.0Ki       4.7Gi        25Gi
Swap:         979Mi       979Mi       0.0Ki


In [38]:
5.5 - 4.1

1.4000000000000004

In [39]:
ds2 = CLASPRankSplitDataset(file_path=file_path,
                           offset_dict=offset_dict,
                           rank=1,
                           world_size=2,
                           text_sampler=str_sampler,
                           bioseq_sampler=str_sampler,
                           text_tok=text_tok,
                           bioseq_tok=bioseq_tok,
                           logger=None)

2021-04-18 17:47:11.662694 rank: 1 dataset information:
           total len: 564278
            rank len: 282139
    rank line offset: 282139
    rank byte offset: 1520906478
2021-04-18 17:47:12.889981 rank: 1 dataset load data time: 1.227 s
2021-04-18 17:47:12.890055 rank: 1 dataset len: 282138


In [40]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       6.7Gi        19Gi       0.0Ki       4.7Gi        24Gi
Swap:         979Mi       979Mi       0.0Ki


In [41]:
6.7 - 5.5

1.2000000000000002

In [42]:
6.7 - 4.1

2.6000000000000005

In [43]:
assert not(torch.equal(ds1[0][0], ds2[0][0]))

 Ranavirus. NCBI_TaxID=654924; NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_Ta 100
APVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAF 100
="ECO:0000255|HAMAP-Rule:MF_00445" TRANSMEM 354..374 /note="Helical" /evidence="ECO:0000255|HAMAP-Ru 100
GEIELQEIVNGLINTQMYNSPGISIALIFITVGIGFKLSPAPSHQWTPDVYEGSPTPVVAFLSVTSKVAASASATRIFDIPFYFSSNEWHPLLEILAILS 100


In [44]:
assert not(torch.equal(ds1[0][1], ds2[0][1]))

41. RecName: Full=Putative transcription factor 001R; ORFNames=FV3-001R; Frog virus 3 (isolate Goorh 100
SAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRY 100
valuation and plastome evolution."; Nucleic Acids Res. 36:2366-2378(2008). -!- FUNCTION: NDH shuttle 100
FSSNEWHPLLEILAILSMILGNLIAITQTSMKRMLAYSSIGQIGYVIIGIIVGDANGGYASMITYMLFYISMNLGTFACIVLFGLRTGTDNIRDYAGLYT 100


In [45]:
ds1[0]

lor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens) 100
LSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLK 100


(tensor([[11335,   263, 41317, 13354, 11438,  1818,  5021,  3035,   318,  4581,
           1014,   284,   279,   275,   271,   275,   282, 31088, 21727,   741,
            741,  1323,   263,  5049, 15931, 11438,   264,   263, 22143,   741,
            741,  1323,   264,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[ True,  True,  True,  True,

In [48]:
ds2[0]

11-DEC-2019; entry version 24. RecName: Full=NAD(P)H-quinone oxidoreductase subunit 2 B; chloroplast 100
SPGISIALIFITVGIGFKLSPAPSHQWTPDVYEGSPTPVVAFLSVTSKVAASASATRIFDIPFYFSSNEWHPLLEILAILSMILGNLIAITQTSMKRMLA 100


(tensor([[  272,   272,   268,  4628,   268,   273,   271,   272,   280,   282,
           5362,  3273,   273,   275,   269,  1020,  1981,   281,  1476,   284,
          43928,   263,   335,   264,   327,   268,  8607,   637, 33731,   639,
           5015,    83,   894,  1783,  5695,   273,   321,   282, 23135,   676,
            952,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[ True,  True,  True,  True,

# Real parameter dataset setup

In [49]:
text_sampler = partial(basic_rand_sampler, sample_len=1024)
text_tok     = partial(tokenize, context_length=1024, return_mask=True)

bioseq_sampler = partial(basic_rand_sampler, sample_len=512)
bioseq_tok     = partial(basic_aa_tokenizer, context_length=512, return_mask=True)

In [50]:
ds_real = CLASPRankSplitDataset(file_path=file_path,
                           offset_dict=offset_dict,
                           rank=0,
                           world_size=2,
                           text_sampler=text_sampler,
                           bioseq_sampler=bioseq_sampler,
                           text_tok=text_tok,
                           bioseq_tok=bioseq_tok,
                           logger=None)

2021-04-18 17:47:46.644228 rank: 0 dataset information:
           total len: 564278
            rank len: 282139
    rank line offset: 0
    rank byte offset: 118
2021-04-18 17:47:48.073161 rank: 0 dataset load data time: 1.429 s
2021-04-18 17:47:48.073235 rank: 0 dataset len: 282139


## Small sample

In [51]:
# find some smaller length test cases
[(i,len(ds_real.data[i])) for i in range(10000) if len(ds_real.data[i]) < 1024]

[(246, 637),
 (622, 700),
 (629, 926),
 (646, 917),
 (824, 942),
 (1877, 818),
 (2307, 914),
 (2515, 974),
 (2545, 962),
 (2551, 972),
 (3124, 1007)]

In [52]:
idx = 246

In [53]:
ds_real.data[idx]

'12KD_MYCSM Reviewed; 24 AA.,P80438;,01-NOV-1995; integrated into UniProtKB/Swiss-Prot. 01-NOV-1995; sequence version 1. 11-DEC-2019; entry version 23.,RecName: Full=12 kDa protein; Flags: Fragment;,,Mycolicibacterium smegmatis (Mycobacterium smegmatis).,,Bacteria; Actinobacteria; Corynebacteriales; Mycobacteriaceae; Mycolicibacterium.,NCBI_TaxID=1772;,,[1],PROTEIN SEQUENCE.,,,,Pahl A.; Keller U.;,,Submitted (MAR-1995) to UniProtKB.,,,CHAIN 1..>24 /note="12 kDa protein" /id="PRO_0000064349" NON_TER 24,1: Evidence at protein level;,Direct protein sequencing.,SEQUENCE 24 AA; 2766 MW; 0D19F1F488DB3201 CRC64;,MFHVLTLTYLCPLDVVXQTRPAHV\n'

In [54]:
ds_real[idx]

12KD_MYCSM Reviewed; 24 AA. P80438; 01-NOV-1995; integrated into UniProtKB/Swiss-Prot. 01-NOV-1995; sequence version 1. 11-DEC-2019; entry version 23. RecName: Full=12 kDa protein; Flags: Fragment; Mycolicibacterium smegmatis (Mycobacterium smegmatis). Bacteria; Actinobacteria; Corynebacteriales; Mycobacteriaceae; Mycolicibacterium. NCBI_TaxID=1772; [1] PROTEIN SEQUENCE. Pahl A.; Keller U.; Submitted (MAR-1995) to UniProtKB. CHAIN 1..>24 /note="12 kDa protein" /id="PRO_0000064349" NON_TER 24 1: Evidence at protein level; Direct protein sequencing. SEQUENCE 24 AA; 2766 MW; 0D19F1F488DB3201 CRC64; 602
MFHVLTLTYLCPLDVVXQTRPAHV 24


(tensor([[  272,   273, 15597,  ...,     0,     0,     0]]),
 tensor([[ True,  True,  True,  ..., False, False, False]]),
 tensor([10,  4,  6, 19,  9, 17,  9, 17, 21,  9,  1, 13,  9,  2, 19, 19, 22, 14,
         17, 15, 13,  0,  6, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

In [55]:
sample = ds_real.data[idx][:-1].split(","); sample

['12KD_MYCSM Reviewed; 24 AA.',
 'P80438;',
 '01-NOV-1995; integrated into UniProtKB/Swiss-Prot. 01-NOV-1995; sequence version 1. 11-DEC-2019; entry version 23.',
 'RecName: Full=12 kDa protein; Flags: Fragment;',
 '',
 'Mycolicibacterium smegmatis (Mycobacterium smegmatis).',
 '',
 'Bacteria; Actinobacteria; Corynebacteriales; Mycobacteriaceae; Mycolicibacterium.',
 'NCBI_TaxID=1772;',
 '',
 '[1]',
 'PROTEIN SEQUENCE.',
 '',
 '',
 '',
 'Pahl A.; Keller U.;',
 '',
 'Submitted (MAR-1995) to UniProtKB.',
 '',
 '',
 'CHAIN 1..>24 /note="12 kDa protein" /id="PRO_0000064349" NON_TER 24',
 '1: Evidence at protein level;',
 'Direct protein sequencing.',
 'SEQUENCE 24 AA; 2766 MW; 0D19F1F488DB3201 CRC64;',
 'MFHVLTLTYLCPLDVVXQTRPAHV']

In [56]:
" ".join(sample[:-1])

'12KD_MYCSM Reviewed; 24 AA. P80438; 01-NOV-1995; integrated into UniProtKB/Swiss-Prot. 01-NOV-1995; sequence version 1. 11-DEC-2019; entry version 23. RecName: Full=12 kDa protein; Flags: Fragment;  Mycolicibacterium smegmatis (Mycobacterium smegmatis).  Bacteria; Actinobacteria; Corynebacteriales; Mycobacteriaceae; Mycolicibacterium. NCBI_TaxID=1772;  [1] PROTEIN SEQUENCE.    Pahl A.; Keller U.;  Submitted (MAR-1995) to UniProtKB.   CHAIN 1..>24 /note="12 kDa protein" /id="PRO_0000064349" NON_TER 24 1: Evidence at protein level; Direct protein sequencing. SEQUENCE 24 AA; 2766 MW; 0D19F1F488DB3201 CRC64;'

In [57]:
sample[-1]

'MFHVLTLTYLCPLDVVXQTRPAHV'

## Long sample

In [58]:
idx = 0

In [59]:
ds_real.data[idx]

'001R_FRG3G Reviewed; 256 AA.,Q6GZX4;,28-JUN-2011; integrated into UniProtKB/Swiss-Prot. 19-JUL-2004; sequence version 1. 12-AUG-2020; entry version 41.,RecName: Full=Putative transcription factor 001R;,ORFNames=FV3-001R;,Frog virus 3 (isolate Goorha) (FV-3).,,Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae; Ranavirus.,NCBI_TaxID=654924;,NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens). NCBI_TaxID=45438; Lithobates sylvaticus (Wood frog) (Rana sylvatica). NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,Tan W.G.; Barkman T.J.; Gregory Chinchar V.; Essani K.;,"Comparative genomic analyses of frog virus 3; type species of the genus Ranavirus (family Iridoviridae).";,Virology 323:70-84(2004).,-!- F

In [60]:
ds_real[idx]

Prot. 19-JUL-2004; sequence version 1. 12-AUG-2020; entry version 41. RecName: Full=Putative transcription factor 001R; ORFNames=FV3-001R; Frog virus 3 (isolate Goorha) (FV-3). Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae; Ranavirus. NCBI_TaxID=654924; NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens). NCBI_TaxID=45438; Lithobates sylvaticus (Wood frog) (Rana sylvatica). NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens). [1] NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]. PubMed=15165820; DOI=10.1016/j.virol.2004.02.019; Tan W.G.; Barkman T.J.; Gregory Chinchar V.; Essani K.; "Comparative genomic analyses of frog virus 3; type species of the genus Ranavirus (family Iridoviridae)."; Virology 323:70-84(2004). -!- FUNCTION: Transcription activation. {ECO:0000305}. EMBL; AY548484; AAT09660.1; -; Genom

(tensor([[644, 339, 269,  ...,   0,   0,   0]]),
 tensor([[ True,  True,  True,  ..., False, False, False]]),
 tensor([10,  0,  4, 16,  0,  3,  2, 19,  9,  8,  3, 21,  2, 15, 15, 15, 15, 10,
          3,  0,  9,  9,  9, 16,  9, 21, 21, 13, 11,  2, 15,  8,  9,  9,  2, 21,
          8,  3, 20, 16, 13, 13, 15, 19, 14, 19,  3,  1, 13,  8,  0, 13, 19,  3,
         20, 11, 11, 13, 13, 16,  3,  8,  5,  9,  7, 19,  5,  6,  4, 16,  5,  7,
          8, 21,  8,  5,  3,  8,  0, 14,  0, 16,  3, 19,  2, 19, 11,  8, 10,  1,
          1, 20, 19, 16,  8,  4,  8,  2,  0, 10, 15, 15, 21, 14,  5,  7, 14, 17,
          1,  8,  7, 13,  5,  8, 19,  9, 16,  2,  9,  2,  0,  8,  7,  8,  0, 21,
         11,  9, 17, 19,  3,  5, 19,  3,  5,  4, 19, 15, 21, 16, 15, 19, 17,  8,
         14,  6, 19,  0,  0,  4,  9,  8,  3,  9, 15,  6, 16,  8, 14, 21,  3, 11,
         19, 11,  9,  7,  6, 21,  7,  9, 17,  2,  8, 15, 19,  2,  7, 14,  6,  9,
          3,  8,  2,  9, 19,  8,  2,  4,  8,  0,  9, 19,  3, 16,  0,  6, 15, 10,

In [61]:
sample = ds_real.data[idx][:-1].split(","); sample

['001R_FRG3G Reviewed; 256 AA.',
 'Q6GZX4;',
 '28-JUN-2011; integrated into UniProtKB/Swiss-Prot. 19-JUL-2004; sequence version 1. 12-AUG-2020; entry version 41.',
 'RecName: Full=Putative transcription factor 001R;',
 'ORFNames=FV3-001R;',
 'Frog virus 3 (isolate Goorha) (FV-3).',
 '',
 'Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae; Ranavirus.',
 'NCBI_TaxID=654924;',
 'NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens). NCBI_TaxID=45438; Lithobates sylvaticus (Wood frog) (Rana sylvatica). NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).',
 '[1]',
 'NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].',
 '',
 'PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;',
 '',
 'Tan W.G.; Barkman T.J.; Gregory Chinchar V.; Essani K.;',
 '"Comparative genomic analyses of frog virus 3; type species of the genus R

In [62]:
" ".join(sample[:-1])

'001R_FRG3G Reviewed; 256 AA. Q6GZX4; 28-JUN-2011; integrated into UniProtKB/Swiss-Prot. 19-JUL-2004; sequence version 1. 12-AUG-2020; entry version 41. RecName: Full=Putative transcription factor 001R; ORFNames=FV3-001R; Frog virus 3 (isolate Goorha) (FV-3).  Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae; Ranavirus. NCBI_TaxID=654924; NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens). NCBI_TaxID=45438; Lithobates sylvaticus (Wood frog) (Rana sylvatica). NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens). [1] NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].  PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;  Tan W.G.; Barkman T.J.; Gregory Chinchar V.; Essani K.; "Comparative genomic analyses of frog virus 3; type species of the genus Ranavirus (family Iridoviridae)."; Virology 323:70-84(2004). -!- F

In [63]:
sample[-1]

'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'

# End