In [1]:
import sys
# Add the parent directory to the import search path — presumably so the
# `utils` package imported in the next cell resolves; confirm the repo layout.
sys.path.append('../')

In [2]:
import os
import pickle
from copy import deepcopy
from torch.utils.data import DataLoader
from utils.datasets import LCQMCDataset

In [3]:
# Data locations. These are relative paths, so they resolve against the
# current working directory of the kernel.
DATA_PATH = '../data/'
WORD_VECTORS_PATH = os.path.join(DATA_PATH, 'word_vectors')
BAIDUBAIKE_PKL = os.path.join(DATA_PATH, 'word_vectors', 'baidubaike.pkl')
LCQMC_PATH = os.path.join(DATA_PATH, 'LCQMC')

# Sequence length passed to LCQMCDataset when the dataset is built.
MAX_SEQ_LEN = 40

In [4]:
# Load the pickled object stored at BAIDUBAIKE_PKL; its 'wi' entry is read
# in the next cell.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
with open(BAIDUBAIKE_PKL, 'rb') as f:
    wvs = pickle.load(f)

In [5]:
# 'wi' entry of the loaded pickle — presumably a token -> index mapping
# (the name suggests "word to index"); verify against the pickle's producer.
w2i = wvs['wi']

# Build the LCQMC dataset from the data directory, the maximum sequence
# length and the lookup above, with character mode enabled.
dataset = LCQMCDataset(
    LCQMC_PATH,
    MAX_SEQ_LEN,
    w2i,
    charmode=True,
)

In [6]:
# Select the 'train' split; the next cells inspect its size and one example.
dataset.to('train')

In [7]:
# Number of examples in the currently selected ('train') split.
len(dataset)

238766

In [8]:
# Peek at one training example: per the output, a dict of tensors keyed
# 'ids1', 'ids2', 'len1', 'len2' and 'label', with the id tensors zero-padded.
dataset[1]

{'ids1': tensor([   75,  1317,  1956, 11373,    10,    75,   808,  6124,    43,  1317,
          1956,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'ids2': tensor([   75,   808,  4602,    43,   382,  1317,  1956,  3316,  5766, 15544,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'len1': tensor(11),
 'len2': tensor(10),
 'label': tensor(1.)}

In [9]:
# Select the 'dev' split. len(dataset) changes after this call, so `to`
# switches the split on the existing object rather than returning a new one.
dataset.to('dev')

In [10]:
# Number of examples in the currently selected ('dev') split.
len(dataset)

8802

In [11]:
# Same inspection for the 'dev' split: item 1 shows the same five keys as
# the training example.
dataset[1]

{'ids1': tensor([ 1600,  2112,   722,   131,     9,  2459, 14299,   986,   855,  1768,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'ids2': tensor([  799,  2468,    67,  2519,  1678,  1600,  5771,    73,   128,  7964,
             9, 10388, 11774,  2320,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'len1': tensor(10),
 'len2': tensor(14),
 'label': tensor(0.)}

In [12]:
# Select the 'test' split before checking its size.
dataset.to('test')

In [13]:
# Number of examples in the currently selected ('test') split.
len(dataset)

12500

In [14]:
# NOTE(review): repeats the earlier 'train' switch and shows no output —
# presumably a check that splits can be re-selected; confirm or remove.
dataset.to('train')

In [15]:
# NOTE(review): repeats the earlier 'dev' switch and shows no output —
# presumably a check that splits can be re-selected; confirm or remove.
dataset.to('dev')

In [16]:
# Re-select 'test'. This is the last split switch before the loaders are
# built; get_loader deep-copies the dataset and sets the split on the copy,
# so the split chosen here does not carry into the loaders.
dataset.to('test')

# DataLoader测试

In [17]:
def get_loader(dataset, mode, batch_size=2, shuffle=False):
    """Build a DataLoader over an independent copy of ``dataset`` set to ``mode``.

    The dataset is deep-copied before ``to(mode)`` is called, so the object
    passed in keeps its current split and several loaders (e.g. dev and
    test) can coexist without affecting one another.

    Args:
        dataset: Object exposing ``to(mode)`` plus the ``__len__`` /
            ``__getitem__`` protocol that ``DataLoader`` expects.
        mode: Split name forwarded to ``dataset.to`` (this notebook uses
            'train', 'dev' and 'test').
        batch_size: Number of examples per batch. Defaults to 2, the value
            that used to be hard-coded here.
        shuffle: Whether the loader shuffles the examples. Defaults to
            False, matching the previous behaviour.

    Returns:
        A ``torch.utils.data.DataLoader`` wrapping the copied dataset.
    """
    # NOTE(review): deepcopy duplicates everything the dataset object holds,
    # which may be costly for a large dataset — confirm this is acceptable.
    split_dataset = deepcopy(dataset)
    split_dataset.to(mode)
    return DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle)

In [18]:
# One loader per evaluation split, built in the order dev, then test.
dev_loader, test_loader = (
    get_loader(dataset, split) for split in ('dev', 'test')
)

In [19]:
# Explicit iterators so single batches can be pulled with next().
dev_iter, test_iter = iter(dev_loader), iter(test_loader)

In [20]:
# Pull the first batch from each iterator (dev first, then test).
dev_batch, test_batch = next(dev_iter), next(test_iter)

In [21]:
# Display the first dev batch. With get_loader's batch size of 2 the id
# tensors have two rows; the second 'ids1' row matches the dev dataset[1]
# shown earlier, consistent with shuffle=False.
dev_batch

{'ids1': tensor([[ 1125,  1140, 12191,   963,  9205,  3865,  1174, 15402, 11774,  8066,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [ 1600,  2112,   722,   131,     9,  2459, 14299,   986,   855,  1768,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 'ids2': tensor([[ 1140, 12191,   963,  9205,  2468, 21232,  3865,  1174, 15402, 11774,
           1125,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  799,  2

In [22]:
# Display the first batch drawn from the test loader.
test_batch

{'ids1': tensor([[  1600,     21,   9555,     68,     67, 371663, 621167,    899,      3,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0],
         [  3609,   4811,   3296,  12617,  10388,  11774,   3609,   4811,    189,
             459,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0]]),
 'ids2': tensor([[    67, 371663, 621167,    899,    971,   1600,     21,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,   