In [1]:
import glob
import os
from random import shuffle
import random

def pre_process_data(filepath):
    dataset = []
    dirs = []
    
    for dir in os.listdir(filepath):
        if os.path.isdir(filepath + dir):
            dirs.append(filepath + dir)
            
    for i, dir_path in enumerate(dirs):
        dir_name = dir_path.split('/')[-1]
        label_id = i
        print('label_id: {}, dir_name: {}'.format(label_id, dir_name))
        
        for filename in glob.glob(os.path.join(filepath, dir_name, dir_name + "*.txt")):
            with open(filename, 'r' ,encoding="utf-8") as f:
                #datasets hold sets of tuples such as (label, input text)
                dataset.append((label_id, f.read()))
                
    random.seed(1234)            
    shuffle(dataset)
    
    return dataset

In [2]:
path = "../livedoor_data/text/"

dataset = pre_process_data(path)
dataset[0]

label_id: 0, dir_name: dokujo-tsushin
label_id: 1, dir_name: it-life-hack
label_id: 2, dir_name: kaden-channel
label_id: 3, dir_name: livedoor-homme
label_id: 4, dir_name: movie-enter
label_id: 5, dir_name: peachy
label_id: 6, dir_name: smax
label_id: 7, dir_name: sports-watch
label_id: 8, dir_name: topic-news


(7,
 'http://news.livedoor.com/article/detail/6283739/\n2012-02-16T08:00:00+0900\n「もう仮病使えよ」香川、長友ら招集に  \n14日、日本サッカー協会は29日に行なわれるW杯アジア3次予選・ウズベキスタン戦に向けて香川真司(ドルトムント/ドイツ)、長友佑都(インテル/イタリア)ら海外組14人の招集を求め、所属クラブに協力を要請する文書を送付したと発表した。負傷中の本田圭佑(CSKAモスクワ/ロシア)は含まれないものの、所属クラブへの完全移籍が見送られた宇佐美貴史(バイエルン/ドイツ)、移籍したばかりの家長昭博(蔚山現代/韓国)らの名前もあるという。\n\n しかし、日本はすでに最終予選進出を決めており、この試合は形としては消化試合となる。海外組を含めたフルメンバーを投入する意図は「試合間隔をあけすぎないこと」などが予想されるが、特に海外で好調を維持する香川、長友の招集にはサッカーファンからさまざまな反応が出た。\n\n 「新戦力発掘しないでどうすんだよ」「宇佐美は五輪のほうに呼べよ」「ジーコ解任デモやった奴、出番だぞ」「もう仮病使えよ」「ドルトムントは招集文書破り捨ててOK」といった、海外組の招集に反対する声が高まった一方で、「ここで呼ばなきゃ、6月の最終予選にぶっつけ本番だぞ」「海外組がいるのといないのとじゃスポンサー料が全然違うからな」「いやこれくらいこなせるだろwお前ら過保護w」といった意見も散見された。\n\n■関連リンク\n・香川真司の得点で勝利。チームも香川も好調を維持。\u3000【ボルシア・ドルトムントｖｓレヴァークーゼン】\n・伊紙、長友の必要性を力説し指揮官を酷評「早く起用すべきだった」\n・【加部究コラム】プロの充実がなければ未来は暗い\n')

In [3]:
len(dataset)

7367

In [4]:
import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor()
tokenizer.Load("../wiki_data/wikiextractor/spm.model")

test = tokenizer.EncodeAsIds(dataset[0][1])
print(test)

[6, 522, 268, 268, 334, 163, 317, 317, 2670, 706, 160, 106, 486, 4491, 497, 192, 531, 106, 5232, 317, 4349, 751, 918, 317, 4085, 3281, 1918, 317, 59, 739, 2383, 2552, 317, 2300, 2528, 28, 82, 272, 179, 290, 62, 163, 545, 163, 545, 1915, 290, 65, 545, 1411, 2203, 1738, 1191, 590, 265, 899, 18, 1910, 98, 3, 84, 1293, 97, 1951, 473, 10, 6, 361, 30, 3, 91, 1570, 1177, 7, 759, 126, 167, 40, 2858, 345, 3181, 1779, 32, 357, 2943, 11, 132, 95, 358, 159, 1227, 99, 5443, 1910, 98, 593, 1029, 15, 1172, 44, 80, 1435, 317, 581, 12, 3, 84, 1293, 6552, 949, 15, 269, 1866, 317, 1122, 12, 97, 2743, 606, 361, 478, 1951, 473, 8, 1825, 3, 860, 1074, 10, 2139, 8, 4815, 25, 3257, 8, 789, 530, 17, 16, 4029, 5, 4306, 1426, 73, 74, 6140, 6552, 15, 145, 130, 5026, 4909, 317, 959, 12, 7, 2172, 207, 1153, 3, 860, 1074, 261, 2343, 2758, 9, 198, 789, 342, 2487, 671, 529, 2627, 861, 15, 994, 838, 131, 317, 581, 12, 3, 2758, 17, 4398, 4, 122, 84, 4682, 1987, 15, 7162, 63, 1941, 317, 1716, 12, 97, 3083, 711, 61, 5, 25

In [42]:
def tokenize_and_encode_with_sp(dataset):
    tokenized_and_encoded_data = []
    i = 0
    for sample in dataset:
        i += 1
        tokens_list_for_each_sample = tokenizer.EncodeAsIds(sample[1])[1:]
        tokens_list_for_each_sample.insert(0, 1)
        tokens_list_for_each_sample.append(2)
        print(tokens_list_for_each_sample)
        tokenized_and_encoded_data.append(tokens_list_for_each_sample)
    return tokenized_and_encoded_data

In [43]:
def collect_labels(dataset):
    labels = []
    for sample in dataset:
        labels.append(sample[0])
    
    return labels

In [44]:
tokenized_and_encoded_inputs = tokenize_and_encode_with_sp(dataset)
labels = collect_labels(dataset)

[1, 522, 268, 268, 334, 163, 317, 317, 2670, 706, 160, 106, 486, 4491, 497, 192, 531, 106, 5232, 317, 4349, 751, 918, 317, 4085, 3281, 1918, 317, 59, 739, 2383, 2552, 317, 2300, 2528, 28, 82, 272, 179, 290, 62, 163, 545, 163, 545, 1915, 290, 65, 545, 1411, 2203, 1738, 1191, 590, 265, 899, 18, 1910, 98, 3, 84, 1293, 97, 1951, 473, 10, 6, 361, 30, 3, 91, 1570, 1177, 7, 759, 126, 167, 40, 2858, 345, 3181, 1779, 32, 357, 2943, 11, 132, 95, 358, 159, 1227, 99, 5443, 1910, 98, 593, 1029, 15, 1172, 44, 80, 1435, 317, 581, 12, 3, 84, 1293, 6552, 949, 15, 269, 1866, 317, 1122, 12, 97, 2743, 606, 361, 478, 1951, 473, 8, 1825, 3, 860, 1074, 10, 2139, 8, 4815, 25, 3257, 8, 789, 530, 17, 16, 4029, 5, 4306, 1426, 73, 74, 6140, 6552, 15, 145, 130, 5026, 4909, 317, 959, 12, 7, 2172, 207, 1153, 3, 860, 1074, 261, 2343, 2758, 9, 198, 789, 342, 2487, 671, 529, 2627, 861, 15, 994, 838, 131, 317, 581, 12, 3, 2758, 17, 4398, 4, 122, 84, 4682, 1987, 15, 7162, 63, 1941, 317, 1716, 12, 97, 3083, 711, 61, 5, 25

In [13]:
print('len(tokenized_and_encoded_inputs):', len(tokenized_and_encoded_inputs))
print('len(tokenized_and_encoded_inputs[0]):', len(tokenized_and_encoded_inputs[0]))
print('len(labels):', len(labels))

len(tokenized_and_encoded_inputs): 7367
len(tokenized_and_encoded_inputs[0]): 504
len(labels): 7367


In [33]:
tokenized_and_encoded_inputs[2]

[1,
 522,
 268,
 268,
 334,
 163,
 317,
 317,
 2670,
 706,
 160,
 106,
 486,
 4491,
 497,
 192,
 531,
 106,
 5232,
 317,
 4349,
 751,
 918,
 317,
 4085,
 3281,
 1918,
 317,
 59,
 631,
 3787,
 3437,
 317,
 2300,
 2528,
 37,
 82,
 660,
 179,
 249,
 163,
 545,
 163,
 545,
 1915,
 290,
 65,
 545,
 6,
 759,
 30,
 7,
 38,
 159,
 142,
 204,
 215,
 192,
 947,
 6,
 120,
 3973,
 647,
 852,
 42,
 941,
 8,
 39,
 4070,
 473,
 4,
 617,
 360,
 3471,
 1741,
 814,
 947,
 6,
 300,
 486,
 268,
 1296,
 436,
 192,
 192,
 394,
 4,
 1639,
 382,
 981,
 275,
 20,
 300,
 486,
 268,
 1296,
 436,
 192,
 192,
 394,
 434,
 106,
 2326,
 334,
 635,
 1529,
 964,
 138,
 3032,
 4033,
 1358,
 12,
 8,
 1335,
 25,
 968,
 2838,
 160,
 2764,
 7,
 3,
 37,
 19,
 759,
 30,
 15,
 30,
 12,
 10,
 1341,
 1092,
 225,
 14,
 814,
 101,
 269,
 1866,
 4,
 1876,
 20,
 238,
 368,
 268,
 1337,
 1146,
 116,
 945,
 368,
 1193,
 192,
 413,
 286,
 1148,
 2540,
 6,
 508,
 1027,
 394,
 241,
 436,
 216,
 2300,
 18,
 10,
 2273,
 4,
 2402,
 27,
 8,

In [14]:
split_data = int(len(tokenized_and_encoded_inputs)* 0.8)

x_train = tokenized_and_encoded_inputs[:split_data]
x_test = tokenized_and_encoded_inputs[split_data:]
y_train= labels[:split_data]
y_test = labels[split_data:]

In [15]:
print(x_train[0])

[1, 522, 268, 268, 334, 163, 317, 317, 2670, 706, 160, 106, 486, 4491, 497, 192, 531, 106, 5232, 317, 4349, 751, 918, 317, 4085, 3281, 1918, 317, 59, 739, 2383, 2552, 317, 2300, 2528, 28, 82, 272, 179, 290, 62, 163, 545, 163, 545, 1915, 290, 65, 545, 1411, 2203, 1738, 1191, 590, 265, 899, 18, 1910, 98, 3, 84, 1293, 97, 1951, 473, 10, 6, 361, 30, 3, 91, 1570, 1177, 7, 759, 126, 167, 40, 2858, 345, 3181, 1779, 32, 357, 2943, 11, 132, 95, 358, 159, 1227, 99, 5443, 1910, 98, 593, 1029, 15, 1172, 44, 80, 1435, 317, 581, 12, 3, 84, 1293, 6552, 949, 15, 269, 1866, 317, 1122, 12, 97, 2743, 606, 361, 478, 1951, 473, 8, 1825, 3, 860, 1074, 10, 2139, 8, 4815, 25, 3257, 8, 789, 530, 17, 16, 4029, 5, 4306, 1426, 73, 74, 6140, 6552, 15, 145, 130, 5026, 4909, 317, 959, 12, 7, 2172, 207, 1153, 3, 860, 1074, 261, 2343, 2758, 9, 198, 789, 342, 2487, 671, 529, 2627, 861, 15, 994, 838, 131, 317, 581, 12, 3, 2758, 17, 4398, 4, 122, 84, 4682, 1987, 15, 7162, 63, 1941, 317, 1716, 12, 97, 3083, 711, 61, 5, 25

In [16]:
#To check the maximum input steps among the entire dataset
max = 0
for elem in tokenized_and_encoded_inputs:
    if len(elem) > max:
        max = len(elem)
        
print('max step-length:', max)

max step-length: 7314


In [17]:
#To check the minimus input steps among the entire dataset
min = 7313
for elem in tokenized_and_encoded_inputs:
    if len(elem) < min:
        min = len(elem)
        
print('min step-length:', min)

min step-length: 70


In [18]:
#To check the average input steps among the entire dataset
sum = 0
total_num = len(tokenized_and_encoded_inputs)
for elem in tokenized_and_encoded_inputs:
     sum += len(elem)
        
print('avg step-length:', sum/total_num)

avg step-length: 891.2934708836705


In [19]:
max_len = 716

In [20]:
from tqdm import tqdm

pad_id = 0
def pad_or_truncate_inputs(data, max_len):
    new_data = []
        
    for sample in tqdm(data):
        if len(sample) >= max_len:
            tmp = sample[:max_len]
        else:
            tmp = sample
            num_of_pads_needed = max_len - len(sample)
            for _ in range(num_of_pads_needed):
                tmp.append(pad_id)
                
        new_data.append(tmp)
        
    return new_data

In [21]:
import numpy as np

x_train = pad_or_truncate_inputs(x_train, max_len)
x_test = pad_or_truncate_inputs(x_test, max_len)

x_train = np.array(x_train)
x_test = np.array(x_test)

print('x_train.shape:', x_train.shape)
print('x_test.shape:', x_test.shape)

100%|██████████| 5893/5893 [00:00<00:00, 35868.47it/s]
100%|██████████| 1474/1474 [00:00<00:00, 38000.20it/s]


x_train.shape: (5893, 716)
x_test.shape: (1474, 716)


In [22]:
print('len(x_train[0]):', len(x_train[0]))
print('len(x_test[1]):', len(x_test[1]))
print('len(x_test[2]):', len(x_test[2]))

len(x_train[0]): 716
len(x_test[1]): 716
len(x_test[2]): 716


In [23]:
print(x_train[0])

[   1  522  268  268  334  163  317  317 2670  706  160  106  486 4491
  497  192  531  106 5232  317 4349  751  918  317 4085 3281 1918  317
   59  739 2383 2552  317 2300 2528   28   82  272  179  290   62  163
  545  163  545 1915  290   65  545 1411 2203 1738 1191  590  265  899
   18 1910   98    3   84 1293   97 1951  473   10    6  361   30    3
   91 1570 1177    7  759  126  167   40 2858  345 3181 1779   32  357
 2943   11  132   95  358  159 1227   99 5443 1910   98  593 1029   15
 1172   44   80 1435  317  581   12    3   84 1293 6552  949   15  269
 1866  317 1122   12   97 2743  606  361  478 1951  473    8 1825    3
  860 1074   10 2139    8 4815   25 3257    8  789  530   17   16 4029
    5 4306 1426   73   74 6140 6552   15  145  130 5026 4909  317  959
   12    7 2172  207 1153    3  860 1074  261 2343 2758    9  198  789
  342 2487  671  529 2627  861   15  994  838  131  317  581   12    3
 2758   17 4398    4  122   84 4682 1987   15 7162   63 1941  317 1716
   12 

In [24]:
from keras.utils.np_utils import to_categorical

y_train = np.array(y_train)
y_test = np.array(y_test)
y_train = to_categorical(y_train.astype('int32'), 9)
y_test = to_categorical(y_test.astype('int32'), 9)

print('y_train.shape:', y_train.shape)
print('y_test.shape:', y_test.shape)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


y_train.shape: (5893, 9)
y_test.shape: (1474, 9)


In [25]:
import tensorflow as tf

def save_data_as_tfrecord(X, Y, tfrecord_filename):
    with tf.python_io.TFRecordWriter(tfrecord_filename) as w:
        for x, y in tqdm(zip(X, Y)):
            x = x.reshape(-1)
            features = tf.train.Features(feature = {
                'X': tf.train.Feature(float_list = tf.train.FloatList(value=x)),
                'Y': tf.train.Feature(float_list = tf.train.FloatList(value=y))
            })
            
            example = tf.train.Example(features=features)
            w.write(example.SerializeToString())

In [26]:
%%time
save_data_as_tfrecord(x_train, y_train, 'train_transformer_with_sp.tfrecord')
save_data_as_tfrecord(x_test, y_test, 'test_transformer_with_sp.tfrecord')
print('TFRcord files are created for training and test data.')

5893it [00:18, 310.78it/s]
1474it [00:04, 310.57it/s]

TFRcord files are created for training and test data.
CPU times: user 23.7 s, sys: 136 ms, total: 23.9 s
Wall time: 23.7 s





In [29]:
x_train[3]

array([   1,  522,  268,  268,  334,  163,  317,  317, 2670,  706,  160,
        106,  486, 4491,  497,  192,  531,  106, 5232,  317, 4349,  751,
        918,  317, 4085, 3281, 1918,  317,   59, 1198, 3621, 3200,  317,
       2300, 2528,   65, 2528,   22,  179,   92,  163,  545,  163,  545,
       1915,  290,   65,  545,    6,  529,  445,   10, 4199,  689,  257,
         40, 4626,  317,  529,   42, 4958,  550, 2151, 4968,  150,   79,
       6308, 6050, 3973,  224, 6051, 3494, 3648,    6,  529,   21,  202,
          8, 1518, 4781,   25,    4,    7,    3, 1241,    4, 2148,  849,
       1415,    4, 2418, 1167, 1791,   16,  572,  146,   24, 1595,  572,
         36, 1721,  200, 1763,    5,   20, 5430,   11, 3973,  224,   18,
         71,  242,  392,   51, 1843,   23,    3,  170,  116,  216,  945,
        286,    4,   38, 2092,   14,  393, 2668,   15, 1159,   13,   62,
         19,  594,   30,  844,   62,   19,  759,   30,   12,  976, 1418,
       3842, 2777, 2810,  170,   48,    8,  267, 37