# Introduction

Idea based on https://github.com/pytorch/text/issues/130#issuecomment-510412877.

# Setup 

In [24]:
import json
import time
from datetime import datetime
import torch
from torch.utils.data import Dataset, DataLoader

# Paths 

In [4]:
path_small = "../data/uniprot_sprot.csv"
path_all   = "../data/uniprot.csv"

# Offset dict

https://github.com/pytorch/text/issues/130#issuecomment-510412877

```
!wc ../data/uniprot_sprot.csv -l
564278 ../data/uniprot_sprot.csv

!wc ../data/uniprot.csv -l
208365011 ../data/uniprot.csv
```

In [5]:
large_file_path = path_small
number_of_lines = 564278

In [6]:
#large_file_path = path_all
#number_of_lines = 208365011

## Create offset dict

In [7]:
offset_dict = {}
with open(large_file_path, 'rb') as f:
    f.readline()  # move over header
    for line in range(number_of_lines):
        offset = f.tell()
        offset_dict[line] = offset
        f.readline()

In [8]:
offset_dict

{0: 118,
 1: 1977,
 2: 4044,
 3: 5922,
 4: 7482,
 5: 9290,
 6: 10987,
 7: 12719,
 8: 14292,
 9: 16726,
 10: 18110,
 11: 19535,
 12: 21443,
 13: 23194,
 14: 26382,
 15: 28148,
 16: 29942,
 17: 32167,
 18: 34033,
 19: 35700,
 20: 37561,
 21: 38922,
 22: 40388,
 23: 42074,
 24: 43924,
 25: 45372,
 26: 49396,
 27: 51274,
 28: 52847,
 29: 55101,
 30: 56780,
 31: 58290,
 32: 60047,
 33: 61724,
 34: 63387,
 35: 65871,
 36: 67412,
 37: 69222,
 38: 70630,
 39: 72424,
 40: 74045,
 41: 75388,
 42: 76824,
 43: 78962,
 44: 80645,
 45: 82464,
 46: 83832,
 47: 85579,
 48: 87632,
 49: 89116,
 50: 90620,
 51: 92637,
 52: 94523,
 53: 95936,
 54: 97582,
 55: 99081,
 56: 100879,
 57: 103436,
 58: 104814,
 59: 106563,
 60: 108287,
 61: 110183,
 62: 112986,
 63: 114340,
 64: 116009,
 65: 117515,
 66: 119006,
 67: 120380,
 68: 121815,
 69: 123191,
 70: 125038,
 71: 127249,
 72: 129587,
 73: 131183,
 74: 133129,
 75: 135169,
 76: 136731,
 77: 139620,
 78: 140989,
 79: 142718,
 80: 145103,
 81: 146858,
 82: 14

In [None]:
path_offset_dict = f"{large_file_path.split('.csv')[0]}_offset_dict.json"; path_offset_dict

## Save offset dict

In [10]:
with open(path_offset_dict, 'w', encoding='utf-8') as f:
    json.dump(offset_dict, f, ensure_ascii=False, indent=4)

## Load offset dict

In [None]:
path_offset_dict = f"{large_file_path.split('.csv')[0]}_offset_dict.json"; path_offset_dict

In [12]:
with open(path_offset_dict, "r", encoding='utf-8') as data_file:    
    offset_dict = json.load(data_file)

In [13]:
offset_dict

{'0': 118,
 '1': 1977,
 '2': 4044,
 '3': 5922,
 '4': 7482,
 '5': 9290,
 '6': 10987,
 '7': 12719,
 '8': 14292,
 '9': 16726,
 '10': 18110,
 '11': 19535,
 '12': 21443,
 '13': 23194,
 '14': 26382,
 '15': 28148,
 '16': 29942,
 '17': 32167,
 '18': 34033,
 '19': 35700,
 '20': 37561,
 '21': 38922,
 '22': 40388,
 '23': 42074,
 '24': 43924,
 '25': 45372,
 '26': 49396,
 '27': 51274,
 '28': 52847,
 '29': 55101,
 '30': 56780,
 '31': 58290,
 '32': 60047,
 '33': 61724,
 '34': 63387,
 '35': 65871,
 '36': 67412,
 '37': 69222,
 '38': 70630,
 '39': 72424,
 '40': 74045,
 '41': 75388,
 '42': 76824,
 '43': 78962,
 '44': 80645,
 '45': 82464,
 '46': 83832,
 '47': 85579,
 '48': 87632,
 '49': 89116,
 '50': 90620,
 '51': 92637,
 '52': 94523,
 '53': 95936,
 '54': 97582,
 '55': 99081,
 '56': 100879,
 '57': 103436,
 '58': 104814,
 '59': 106563,
 '60': 108287,
 '61': 110183,
 '62': 112986,
 '63': 114340,
 '64': 116009,
 '65': 117515,
 '66': 119006,
 '67': 120380,
 '68': 121815,
 '69': 123191,
 '70': 125038,
 '71': 1

In [14]:
offset_dict["0"]

118

# Single line dataset

In [15]:
class SingleLineDataset(Dataset):
    def __init__(self, large_file_path, offset_dict):
        self.large_file_path = large_file_path
        self.offset_dict = offset_dict
    
    def __len__(self):
        return len(self.offset_dict)
    
    def __getitem__(self, line):
        line = str(line) # because json keys are strings after it is saved
        offset = self.offset_dict[line]
        with open(self.large_file_path, 'r', encoding='utf-8') as f:
            f.seek(offset)
            line = f.readline()
            return line

In [16]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       4.1Gi        22Gi       0.0Ki       4.6Gi        26Gi
Swap:         979Mi       979Mi       0.0Ki


In [17]:
ds = SingleLineDataset(large_file_path, offset_dict)

In [18]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       4.1Gi        22Gi       0.0Ki       4.6Gi        26Gi
Swap:         979Mi       979Mi       0.0Ki


In [19]:
ds[50999]

'CAHS1_PARRC Reviewed; 229 AA.,P0CU51;,07-JUN-2017; integrated into UniProtKB/Swiss-Prot. 07-JUN-2017; sequence version 1. 11-DEC-2019; entry version 5.,RecName: Full=Cytosolic-abundant heat soluble protein 107838 {ECO:0000303|PubMed:28306513}; Short=CAHS 107838 {ECO:0000303|PubMed:28306513}; AltName: Full=Tardigrade-specific intrinsically disordered protein CAHS 107838 {ECO:0000303|PubMed:28306513}; Short=TDP CAHS 107838 {ECO:0000303|PubMed:28306513};,Name=CAHS 107838 {ECO:0000303|PubMed:28306513};,Paramacrobiotus richtersi (Water bear) (Macrobiotus richtersi).,,Eukaryota; Metazoa; Ecdysozoa; Tardigrada; Eutardigrada; Parachela; Macrobiotoidea; Macrobiotidae; Paramacrobiotus; Paramacrobiotus richtersi group.,NCBI_TaxID=697321;,,[1],FUNCTION.,,PubMed=28306513; DOI=10.1016/j.molcel.2017.02.018;,,Boothby T.C.; Tapia H.; Brozena A.H.; Piszkiewicz S.; Smith A.E.; Giovannini I.; Rebecchi L.; Pielak G.J.; Koshland D.; Goldstein B.;,"Tardigrades use intrinsically disordered proteins to surviv

In [20]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       4.1Gi        22Gi       0.0Ki       4.6Gi        26Gi
Swap:         979Mi       979Mi       0.0Ki


# RankSplitDataset

In [25]:
class RankSplitDataset(Dataset):
    def __init__(self, file_path, offset_dict, rank, world_size, logger=None):
        self.file_path        = file_path
        self.offset_dict      = offset_dict
        self.total_len        = len(offset_dict.keys())
        self.rank_len         = self.total_len // world_size
        self.rank_line_offset = self.rank_len * rank
        self.rank_byte_offset = self.offset_dict[str(self.rank_line_offset)] # because json keys are strings after it is saved

        if logger:
            logger.info(f"{datetime.now()} rank: {rank} dataset information:\n{'total len':>20}: {self.total_len}\n{'rank len':>20}: {self.rank_len}\n{'rank line offset':>20}: {self.rank_line_offset}\n{'rank byte offset':>20}: {self.rank_byte_offset}")
        else:
            print(f"{datetime.now()} rank: {rank} dataset information:\n{'total len':>20}: {self.total_len}\n{'rank len':>20}: {self.rank_len}\n{'rank line offset':>20}: {self.rank_line_offset}\n{'rank byte offset':>20}: {self.rank_byte_offset}")

        tp = time.time()
        with open(self.file_path, 'r', encoding='utf-8') as f:
            f.seek(self.rank_byte_offset) # move to the line for the specific rank
            lines = []
            for i in range(self.rank_len): # load all the lines for the rank
                line = f.readline()
                if line != "":
                    lines.append(line)

        self.data = lines

        if logger:
            logger.info(f"{datetime.now()} rank: {rank} dataset load data time: {time.time() - tp:.3f} s")
            logger.info(f"{datetime.now()} rank: {rank} dataset len: {len(self.data)}")
        else:
            print(f"{datetime.now()} rank: {rank} dataset load data time: {time.time() - tp:.3f} s")
            print(f"{datetime.now()} rank: {rank} dataset len: {len(self.data)}")
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [26]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       4.1Gi        22Gi       0.0Ki       4.6Gi        26Gi
Swap:         979Mi       979Mi       0.0Ki


In [27]:
ds1 = RankSplitDataset(large_file_path, offset_dict, rank=0, world_size=2)

2021-04-18 17:22:19.379192 rank: 0 dataset information:
           total len: 564278
            rank len: 282139
    rank line offset: 0
    rank byte offset: 118
2021-04-18 17:22:20.843714 rank: 0 dataset load data time: 1.464 s
2021-04-18 17:22:20.843795 rank: 0 dataset len: 282139


In [28]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       5.6Gi        20Gi       0.0Ki       4.6Gi        25Gi
Swap:         979Mi       979Mi       0.0Ki


In [29]:
5.6 - 4.1

1.5

In [30]:
ds2 = RankSplitDataset(large_file_path, offset_dict, rank=1, world_size=2)

2021-04-18 17:22:33.657333 rank: 1 dataset information:
           total len: 564278
            rank len: 282139
    rank line offset: 282139
    rank byte offset: 1520906478
2021-04-18 17:22:34.898978 rank: 1 dataset load data time: 1.241 s
2021-04-18 17:22:34.899053 rank: 1 dataset len: 282138


In [31]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       6.8Gi        19Gi       0.0Ki       4.6Gi        23Gi
Swap:         979Mi       979Mi       0.0Ki


In [32]:
6.8 - 5.6

1.2000000000000002

In [33]:
6.8 - 4.1

2.7

In [34]:
ds1[0] == ds2[0]

False

In [35]:
ds1[0]

'001R_FRG3G Reviewed; 256 AA.,Q6GZX4;,28-JUN-2011; integrated into UniProtKB/Swiss-Prot. 19-JUL-2004; sequence version 1. 12-AUG-2020; entry version 41.,RecName: Full=Putative transcription factor 001R;,ORFNames=FV3-001R;,Frog virus 3 (isolate Goorha) (FV-3).,,Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae; Ranavirus.,NCBI_TaxID=654924;,NCBI_TaxID=30343; Dryophytes versicolor (chameleon treefrog). NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens). NCBI_TaxID=45438; Lithobates sylvaticus (Wood frog) (Rana sylvatica). NCBI_TaxID=8316; Notophthalmus viridescens (Eastern newt) (Triturus viridescens).,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,,PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;,,Tan W.G.; Barkman T.J.; Gregory Chinchar V.; Essani K.;,"Comparative genomic analyses of frog virus 3; type species of the genus Ranavirus (family Iridoviridae).";,Virology 323:70-84(2004).,-!- F

In [36]:
ds2[0]

'NU2C2_OENPA Reviewed; 510 AA.,P0CD15; B0Z5H2;,09-FEB-2010; integrated into UniProtKB/Swiss-Prot. 09-FEB-2010; sequence version 1. 11-DEC-2019; entry version 24.,RecName: Full=NAD(P)H-quinone oxidoreductase subunit 2 B; chloroplastic {ECO:0000255|HAMAP-Rule:MF_00445}; EC=7.1.1.- {ECO:0000255|HAMAP-Rule:MF_00445}; AltName: Full=NAD(P)H dehydrogenase; subunit 2 B {ECO:0000255|HAMAP-Rule:MF_00445}; AltName: Full=NADH-plastoquinone oxidoreductase subunit 2 B {ECO:0000255|HAMAP-Rule:MF_00445};,Name=ndhB2 {ECO:0000255|HAMAP-Rule:MF_00445};,Oenothera parviflora (Small-flowered evening primrose) (Oenothera cruciata).,Plastid; Chloroplast.,Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliopsida; eudicotyledons; Gunneridae; Pentapetalae; rosids; malvids; Myrtales; Onagraceae; Onagroideae; Onagreae; Oenothera.,NCBI_TaxID=482429;,,[1],NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].,STRAIN=cv. Atrovirens;,PubMed=18299283; DOI=10.1093/nar/gkn081;,,Greiner S.; W

In [37]:
len(ds2)

282138

In [39]:
ds2[282138-1]

'Z_WWAVU Reviewed; 95 AA.,B2ZDY1;,20-JAN-2009; integrated into UniProtKB/Swiss-Prot. 01-JUL-2008; sequence version 1. 10-FEB-2021; entry version 53.,RecName: Full=RING finger protein Z {ECO:0000255|HAMAP-Rule:MF_04087}; Short=Protein Z {ECO:0000255|HAMAP-Rule:MF_04087}; AltName: Full=Zinc-binding protein {ECO:0000255|HAMAP-Rule:MF_04087};,Name=Z {ECO:0000255|HAMAP-Rule:MF_04087};,Whitewater Arroyo mammarenavirus (isolate Rat/United States/AV 9310135/1995) (WWAV).,,Viruses; Riboviria; Orthornavirae; Negarnaviricota; Polyploviricotina; Ellioviricetes; Bunyavirales; Arenaviridae; Mammarenavirus.,NCBI_TaxID=46919;,NCBI_TaxID=42407; Neotoma (wood rats).,[1],NUCLEOTIDE SEQUENCE [GENOMIC RNA].,,PubMed=18602020; DOI=10.1016/j.mib.2008.06.001;,,Charrel R.N.; de Lamballerie X.; Emonet S.;,"Phylogeny of the genus Arenavirus.";,Curr. Opin. Microbiol. 11:362-368(2008).,-!- FUNCTION: Plays a crucial role in virion assembly and budding. Expressed late in the virus life cycle; it acts as an inhibitor 

# Load everything to RAM for comparison

In [41]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       6.8Gi        19Gi       0.0Ki       4.6Gi        23Gi
Swap:         979Mi       979Mi       0.0Ki


In [42]:
with open(large_file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [43]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           31Gi       9.5Gi        17Gi       0.0Ki       4.6Gi        21Gi
Swap:         979Mi       979Mi       0.0Ki


In [45]:
9.5 - 6.8

2.7

In [None]:
# 2.7G Apr  6 15:42 uniprot_sprot.csv