In [1]:
import torch
import json


import multiprocessing as mp


In [2]:
class FileReader:
    def __init__(self, filename):
        self.fin = open(filename, "r")
        self.line_map = list()             # Map from line index -> file position.
        self.line_map.append(0)
        i = 0
        while self.fin.readline():
            self.line_map.append(self.fin.tell())
            i += 1
            if i % 100_000 == 0:
                print(i)

    def get_line(self, index):
        self.fin.seek(self.line_map[index])
        return self.fin.readline()

In [6]:
file_reader = FileReader('../data/rel_matrix_test.json')
line = file_reader.get_line(3).strip()
matrix = json.loads(line)
new_matrix = [line.split() for line in matrix]

In [14]:
import numpy as np

print(np.array(new_matrix).shape)
np.array(new_matrix)

(9, 9)


array([['<self>', '0.001', '0.002', '0.003', '0.004', '0.005', '0.004',
        '0.003', '0.004'],
       ['1.0', '<self>', '0.001', '0.002', '0.003', '0.004', '0.003',
        '0.002', '0.003'],
       ['2.0', '1.0', '<self>', '0.001', '0.002', '0.003', '0.002',
        '0.001', '0.002'],
       ['3.0', '2.0', '1.0', '<self>', '0.001', '0.002', '0.001',
        '1.001', '1.002'],
       ['4.0', '3.0', '2.0', '1.0', '<self>', '0.001', '1.001', '2.001',
        '2.002'],
       ['5.0', '4.0', '3.0', '2.0', '1.0', '<self>', '2.001', '3.001',
        '3.002'],
       ['4.0', '3.0', '2.0', '1.0', '1.001', '1.002', '<self>', '2.001',
        '2.002'],
       ['3.0', '2.0', '1.0', '1.001', '1.002', '1.003', '1.002',
        '<self>', '0.001'],
       ['4.0', '3.0', '2.0', '2.001', '2.002', '2.003', '2.002', '1.0',
        '<self>']], dtype='<U6')

In [18]:
a = {1:'a', 2:'b'}
if 2 in a:
    print('nice')

In [15]:
new_matrix = [[el if el in ['<self>', '0.001', 3] else 'unk' for el in line.split()] for line in matrix]

In [16]:
import numpy as np

print(np.array(new_matrix).shape)
np.array(new_matrix)

(9, 9)


array([['<self>', '0.001', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk',
        'unk'],
       ['unk', '<self>', '0.001', 'unk', 'unk', 'unk', 'unk', 'unk',
        'unk'],
       ['unk', 'unk', '<self>', '0.001', 'unk', 'unk', 'unk', '0.001',
        'unk'],
       ['unk', 'unk', 'unk', '<self>', '0.001', 'unk', '0.001', 'unk',
        'unk'],
       ['unk', 'unk', 'unk', 'unk', '<self>', '0.001', 'unk', 'unk',
        'unk'],
       ['unk', 'unk', 'unk', 'unk', 'unk', '<self>', 'unk', 'unk', 'unk'],
       ['unk', 'unk', 'unk', 'unk', 'unk', 'unk', '<self>', 'unk', 'unk'],
       ['unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', '<self>',
        '0.001'],
       ['unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', 'unk', '<self>']],
      dtype='<U6')

In [157]:
class RelMatDataset(torch.utils.data.Dataset):
    """Custom competition dataset."""

    
    def __init__(self, relmat_path):
        self.relmat_path = relmat_path
        
        self.file_reader = FileReader(self.relmat_path)
        
        self.fds = []
        self.locks = []
        self.num_fds = 0
        self.global_lock = mp.Lock()

        
    def __len__(self):
        return len(self.file_reader.line_map)
    
    def get_fd(self):
        res = -1
        with self.global_lock:
            for i in range(self.num_fds):
                if not self.locks[i]:
                    res = i
                    break
            if res == -1:
                self.locks.append(False)
                self.fds.append({key:(FileReader(self.filenames[key]) \
                               if self.filenames[key] is not None \
                               else None) for key in self.filenames})
                res = self.num_fds
                self.num_fds += 1
            self.locks[res] = True
        return res
    

    def __getitem__(self, index):
        num_fd = self.get_fd()
        fd = self.fds[num_fd]
        ex = {}
        for key in fd:
            if fd[key] is not None:
                f = fd[key]                   
                line = f.get_line(index).strip()
                if key == "rel_matrix":
                    line = json.loads(line)
                ex[key] = line
            else:
                ex[key] = None
        with self.global_lock:
            self.locks[num_fd] = False
    
    
        '''def __getitem__(self, index):
        line = self.file_reader.get_line(index).strip()
        matrix = json.loads(line)
        new_matrix = [line.split() for line in matrix]
        
        print(new_matrix)
        
        return new_matrix'''
    
    
    
    

self.filenames - список файлов, в которых доставать строчки

get_fd выдает набор вот этих файлридеров из этих файлов

и потом у этих файлридеров запрашивается строка с номером index

In [162]:
my_dataset = RelMatDataset(relmat_path='../data/rel_matrix_test.json')
print('all train+val samples:', len(my_dataset))

all train+val samples: 9987


In [163]:
from torch.utils.data import DataLoader

train_loader = DataLoader(my_dataset, batch_size=1,
                          shuffle=False, num_workers=10)
                          #pin_memory=True)

In [169]:
torch.LongTensor([1, 2, 3]).size()[0]

3

In [164]:
for i, el in enumerate(train_loader):
    #print(el[0][1])
    print('---' * 100)
    if i > 2:
        break

[['<self>', '0.001', '0.002', '0.003', '0.003', '0.004', '0.004', '0.005'], ['1.0', '<self>', '0.001', '0.002', '0.002', '0.003', '0.003', '0.004'], ['2.0', '1.0', '<self>', '0.001', '0.001', '0.002', '0.002', '0.003'], ['3.0', '2.0', '1.0', '<self>', '1.001', '1.002', '1.002', '1.003'], ['3.0', '2.0', '1.0', '1.001', '<self>', '0.001', '0.001', '0.002'], ['4.0', '3.0', '2.0', '2.001', '1.0', '<self>', '1.001', '1.002'], ['4.0', '3.0', '2.0', '2.001', '1.0', '1.001', '<self>', '0.001'], ['5.0', '4.0', '3.0', '3.001', '2.0', '2.001', '1.0', '<self>']]
[['<self>', '0.001', '0.002', '0.003', '0.003', '0.004'], ['1.0', '<self>', '0.001', '0.002', '0.002', '0.003'], ['2.0', '1.0', '<self>', '0.001', '0.001', '0.002'], ['3.0', '2.0', '1.0', '<self>', '1.001', '1.002'], ['3.0', '2.0', '1.0', '1.001', '<self>', '0.001'], ['4.0', '3.0', '2.0', '2.001', '1.0', '<self>']]
[['<self>', '0.001', '0.002', '0.003', '0.003', '0.004', '0.004', '0.005'], ['1.0', '<self>', '0.001', '0.002', '0.002', '0.00

In [42]:
import sys

sys.getsizeof(my_dataset.file_reader.line_map)

NameError: name 'my_dataset' is not defined

In [None]:
~300 мегабайт