In [1]:
import pickle
import numpy as np
from tqdm import tqdm
import json
from NLPUtils.DataModel import Vocabulary

In [2]:
# Load the label -> index mapping. Its length fixes the width of every label
# vector built later, and its values are used as positions in those vectors.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('./MLTCDataset/RCV1/data/topic_sorted.json', encoding='utf-8') as f:
    hash_tgt = json.load(f)

In [3]:
# Seed mapping handed to the vocabulary: the unknown-word token gets index 0.
# MAXLEN (200) is the fixed sequence length every document is padded to
# (and, for the test/valid splits, truncated to) in the cells below.
initial_word2idx = {"<UNK>": 0}
train_voc = Vocabulary(initial_word2idx, MAXLEN=200)

In [4]:
# Load the training documents, feed them through the vocabulary, and encode
# each document as a list of word indices padded to train_voc.MAXLEN.
#
# Iterating over the file keeps the final document even when the file does not
# end with a newline (split('\n')[:-1] silently dropped it), and the explicit
# encoding makes decoding independent of the platform default.
with open('./MLTCDataset/RCV1/data/train.src', encoding='utf-8') as f:
    src_datas = [line.rstrip('\n') for line in f]

train_x = []
for sentence in tqdm(src_datas, desc='Constructing Voc'):
    # addSentence returns the token sequence that is indexed below; presumably
    # it also registers new words and truncates to MAXLEN -- TODO confirm
    # against NLPUtils.DataModel.Vocabulary.
    sentence = train_voc.addSentence(sentence=sentence)
    idx_data = [train_voc.word2idx[word] for word in sentence]
    # NOTE(review): '<UNK>' doubles as the padding token, so padded positions
    # cannot be told apart from unknown words downstream.
    pad_idx = train_voc.word2idx['<UNK>']
    idx_data.extend([pad_idx] * (train_voc.MAXLEN - len(idx_data)))
    train_x.append(idx_data)

Constructing Voc: 100%|██████████| 775220/775220 [00:30<00:00, 25449.94it/s]


In [5]:
# Read the training label lines (one whitespace-separated label list per line).
# Iterating over the file keeps the final line even without a trailing newline,
# so the labels cannot fall out of step with the documents because of how the
# file happens to end; the encoding is pinned for the same reproducibility
# reason.
with open('./MLTCDataset/RCV1/data/train.tgt', encoding='utf-8') as f:
    tgt_datas = [line.rstrip('\n') for line in f]

In [6]:
# Turn each line of labels into a dense multi-hot float32 vector with one slot
# per entry of hash_tgt.
# TODO: store these as a sparse matrix instead of dense vectors.
num_labels = len(hash_tgt)
train_y = []
for label_line in tqdm(tgt_datas, desc='transfer tag'):
    multi_hot = np.zeros(shape=(num_labels), dtype=np.float32)
    for label in label_line.split():
        # hash_tgt maps the label string to its position in the vector.
        multi_hot[hash_tgt[label]] = 1.
    train_y.append(multi_hot)

transfer tag: 100%|██████████| 775220/775220 [00:00<00:00, 836476.60it/s]


In [7]:
# Persist the vocabulary object so the same word -> index mapping can be
# reloaded later without re-reading the training corpus.
with open('./MLTCDataset/RCV1/train_voc.data', 'wb') as voc_file:
    pickle.dump(train_voc, voc_file)

In [8]:
# Stack the per-document lists into arrays: int64 word indices and float32
# label vectors.
train_x = np.array(train_x, dtype=np.int64)
train_y = np.array(train_y, dtype=np.float32)
# Fail loudly if the document and label files were out of step; otherwise a
# misaligned dataset would be saved and trained on without any warning.
assert train_x.shape[0] == train_y.shape[0], (
    'train.src / train.tgt are misaligned: %d documents vs %d label rows'
    % (train_x.shape[0], train_y.shape[0])
)

In [9]:
# Write the encoded training inputs and labels next to the raw data files.
for npy_path, split_array in (
    ('./MLTCDataset/RCV1/data/train_x.npy', train_x),
    ('./MLTCDataset/RCV1/data/train_y.npy', train_y),
):
    np.save(npy_path, split_array)

In [10]:
# Test inputs: encode each document with the vocabulary built from the training
# data, mapping unseen words to '<UNK>', then truncate/pad to train_voc.MAXLEN.
#
# Iterating over the file keeps the final document even when the file does not
# end with a newline (split('\n')[:-1] silently dropped it), and the explicit
# encoding makes decoding independent of the platform default.
with open('./MLTCDataset/RCV1/data/test.src', encoding='utf-8') as f:
    src_datas = [line.rstrip('\n') for line in f]

# The vocabulary is only read in this cell, so the '<UNK>' index is looked up
# once. NOTE(review): it is also used as the padding value, so padding cannot
# be told apart from unknown words downstream.
unk_idx = train_voc.word2idx["<UNK>"]
test_x = []
for sentence in tqdm(src_datas, desc='test to word index'):
    words = sentence.split()[:train_voc.MAXLEN]
    idx_data = [
        train_voc.word2idx[word] if train_voc.has(word) else unk_idx
        for word in words
    ]
    idx_data.extend([unk_idx] * (train_voc.MAXLEN - len(idx_data)))
    test_x.append(idx_data)

test to word index: 100%|██████████| 1191/1191 [00:00<00:00, 25047.97it/s]


In [11]:
# Test labels: one dense multi-hot float32 vector per line of test.tgt, with
# one slot per entry of hash_tgt.
#
# Iterating over the file keeps the final line even without a trailing newline
# (split('\n')[:-1] silently dropped it), and the encoding is pinned so
# decoding does not depend on the platform default.
with open('./MLTCDataset/RCV1/data/test.tgt', encoding='utf-8') as f:
    tgt_datas = [line.rstrip('\n') for line in f]

test_y = []
for tgts in tqdm(tgt_datas, desc='transfer tag'):
    tgt_vector = np.zeros(shape=(len(hash_tgt)), dtype=np.float32)
    for tgt in tgts.split():
        # hash_tgt maps the label string to its position in the vector.
        tgt_vector[hash_tgt[tgt]] = 1.
    test_y.append(tgt_vector)

transfer tag: 100%|██████████| 1191/1191 [00:00<00:00, 595401.20it/s]


In [12]:
# Stack the test split into arrays (int64 word indices, float32 label vectors)
# and write them to disk.
test_x = np.array(test_x, dtype=np.int64)
test_y = np.array(test_y, dtype=np.float32)
# Check alignment before writing so a mismatched split is never saved.
assert test_x.shape[0] == test_y.shape[0], (
    'test.src / test.tgt are misaligned: %d documents vs %d label rows'
    % (test_x.shape[0], test_y.shape[0])
)
np.save('./MLTCDataset/RCV1/data/test_x.npy', test_x)
np.save('./MLTCDataset/RCV1/data/test_y.npy', test_y)

In [13]:
# Validation inputs: encode each document with the vocabulary built from the
# training data, mapping unseen words to '<UNK>', then truncate/pad to
# train_voc.MAXLEN.
#
# Iterating over the file keeps the final document even when the file does not
# end with a newline (split('\n')[:-1] silently dropped it), and the explicit
# encoding makes decoding independent of the platform default.
with open('./MLTCDataset/RCV1/data/valid.src', encoding='utf-8') as f:
    src_datas = [line.rstrip('\n') for line in f]

# The vocabulary is only read in this cell, so the '<UNK>' index is looked up
# once. NOTE(review): it is also used as the padding value, so padding cannot
# be told apart from unknown words downstream.
unk_idx = train_voc.word2idx["<UNK>"]
valid_x = []
for sentence in tqdm(src_datas, desc='valid to word index'):
    words = sentence.split()[:train_voc.MAXLEN]
    idx_data = [
        train_voc.word2idx[word] if train_voc.has(word) else unk_idx
        for word in words
    ]
    idx_data.extend([unk_idx] * (train_voc.MAXLEN - len(idx_data)))
    valid_x.append(idx_data)

valid to word index: 100%|██████████| 21510/21510 [00:00<00:00, 23329.74it/s]


In [14]:
# Validation labels: one dense multi-hot float32 vector per line of valid.tgt,
# with one slot per entry of hash_tgt.
#
# Iterating over the file keeps the final line even without a trailing newline
# (split('\n')[:-1] silently dropped it), and the encoding is pinned so
# decoding does not depend on the platform default.
with open('./MLTCDataset/RCV1/data/valid.tgt', encoding='utf-8') as f:
    tgt_datas = [line.rstrip('\n') for line in f]

valid_y = []
for tgts in tqdm(tgt_datas, desc='transfer tag'):
    tgt_vector = np.zeros(shape=(len(hash_tgt)), dtype=np.float32)
    for tgt in tgts.split():
        # hash_tgt maps the label string to its position in the vector.
        tgt_vector[hash_tgt[tgt]] = 1.
    valid_y.append(tgt_vector)

transfer tag: 100%|██████████| 21510/21510 [00:00<00:00, 660769.45it/s]


In [15]:
# Stack the validation split into arrays (int64 word indices, float32 label
# vectors) and write them to disk.
valid_x = np.array(valid_x, dtype=np.int64)
valid_y = np.array(valid_y, dtype=np.float32)
# Check alignment before writing so a mismatched split is never saved.
assert valid_x.shape[0] == valid_y.shape[0], (
    'valid.src / valid.tgt are misaligned: %d documents vs %d label rows'
    % (valid_x.shape[0], valid_y.shape[0])
)
np.save('./MLTCDataset/RCV1/data/valid_x.npy', valid_x)
np.save('./MLTCDataset/RCV1/data/valid_y.npy', valid_y)