In [None]:
%%bash

# Stop at the first failing command so later steps never run against a
# partial download or a failed install.
set -euo pipefail

# Download the dataset

# -p: do not fail when the directory is left over from a previous run.
mkdir -p dataset
# -f: fail on HTTP errors instead of saving the error page as the archive.
# -L: follow redirects.
curl -fsSL -o dataset/dataset.tar.gz https://www.rondhuit.com/download/livedoor-news-data.tar.gz
tar -xvf dataset/dataset.tar.gz -C dataset
rm dataset/dataset.tar.gz


# Install the tools

apt-get update
# -y: the cell has no interactive stdin, so the confirmation prompt would abort.
apt-get install -y mecab file swig libmecab-dev mecab-ipadic-utf8
pip install mecab-python3==0.996.5
pip install transformers==2.11.0

dokujo-tsushin.xml
it-life-hack.xml
kaden-channel.xml
livedoor-homme.xml
movie-enter.xml
peachy.xml
smax.xml
sports-watch.xml
topic-news.xml
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://archive.ubuntu.com/ubuntu bionic-backports InRele

mkdir: cannot create directory ‘dataset’: File exists


In [None]:
'''
-----------------------------------------------------
データの前処理
-----------------------------------------------------
'''

# 1. XMLからのテキスト抽出

import glob
import xml.etree.ElementTree as ET

def get_data(file_name, target):
    """Collect the text of every field named ``target`` in an XML file.

    The tree is walked two levels deep: each child of the root is treated
    as a document and each child of a document as a field.

    Args:
        file_name: Path of the XML file to parse.
        target: Value of the ``name`` attribute to select (e.g. ``"title"``).

    Returns:
        list: ``text`` of every matching field, in document order. An entry
        is ``None`` when the matching element has no text.
    """
    root = ET.parse(file_name).getroot()
    return [
        field.text
        for doc in root
        for field in doc
        # .get(): fields without a "name" attribute are skipped rather than
        # raising KeyError.
        if field.attrib.get("name") == target
    ]

# Read every XML file in a stable (sorted) order and accumulate the title
# and category of each document into two parallel lists.
titles, labels = [], []
for file_name in sorted(glob.glob("dataset/*.xml")):
    titles += get_data(file_name, target="title")
    labels += get_data(file_name, target="cat")


# 2. 単語分割

from transformers import AutoTokenizer

# Load the tokenizer of the pretrained Japanese BERT checkpoint and encode
# every title as a list of token ids (padding requested via pad_to_max_length).
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
encoded_titles = tokenizer.batch_encode_plus(titles, pad_to_max_length=True)
texts = encoded_titles["input_ids"]


# 3. 訓練用／検証用／評価用に分割

import numpy as np
# Fix NumPy's global RNG so the shuffle below (and therefore the split) is
# the same on every run.
np.random.seed(seed=42)

def shuffle(list1, list2):
    """Shuffle two parallel sequences with the same random permutation.

    Items are paired positionally, the pairs are shuffled with NumPy's
    global RNG, and the pairs are split back into two lists. The inputs
    are not modified.

    Args:
        list1: First sequence.
        list2: Second sequence, paired element-wise with ``list1``
            (pairing stops at the shorter of the two).

    Returns:
        tuple[list, list]: The shuffled copies of ``list1`` and ``list2``.
    """
    tmp = list(zip(list1, list2))
    if not tmp:
        # zip(*[]) yields nothing, so the two-way unpacking below would
        # raise ValueError on empty input.
        return [], []
    np.random.shuffle(tmp)
    list1, list2 = zip(*tmp)
    return list(list1), list(list2)

# Shuffle inputs and labels together, then take fixed-size slices:
# the first 5000 for training, the next 1000 for validation and the
# following 1000 for testing.
texts, labels = shuffle(texts, labels)
texts_train, texts_dev, texts_test = texts[:5000], texts[5000:6000], texts[6000:7000]
labels_train, labels_dev, labels_test = labels[:5000], labels[5000:6000], labels[6000:7000]


# タイトルとカテゴリの確認

# Show the first example of each split: its category and its encoded
# title (a list of token ids at this point, not the raw string).
for split_labels, split_texts in (
    (labels_train, texts_train),
    (labels_dev, texts_dev),
    (labels_test, texts_test),
):
    print("カテゴリ: %s" % split_labels[0])
    print("タイトル: %s\n" % split_texts[0])


# ラベルをIDに変換

# Give every distinct category a consecutive integer id, in sorted order.
label2id = {name: idx for idx, name in enumerate(sorted(set(labels)))}

# Map the string labels of each split to their ids.
y_train = list(map(label2id.__getitem__, labels_train))
y_dev = list(map(label2id.__getitem__, labels_dev))
y_test = list(map(label2id.__getitem__, labels_test))

# Token-id lists as NumPy arrays, one per split.
X_train, X_dev, X_test = (
    np.array(split) for split in (texts_train, texts_dev, texts_test)
)

カテゴリ: sports-watch
タイトル: [2, 6040, 10508, 5, 8111, 620, 2049, 28885, 6, 1863, 1107, 9, 36, 3124, 408, 5, 3642, 5, 2512, 9, 6, 1723, 6, 218, 11, 5583, 7, 15, 3051, 28489, 5, 7045, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

カテゴリ: smax
タイトル: [2, 8666, 6, 2737, 12350, 5042, 1208, 5994, 236, 20745, 7, 1277, 15, 10, 9307, 9285, 61, 16325, 2851, 92, 36, 649, 28743, 8143, 28577, 9285, 28718, 28535, 57, 383, 9434, 513, 28865, 38, 11, 602, 679, 16545, 28687, 9881, 57, 383, 502, 649, 28743, 8143, 28577, 26830, 7, 1277, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

カテゴリ: topic-news
タイトル: [2, 36, 28043, 3286, 3435, 7, 24476, 2662, 28749, 14, 7423, 36, 9200, 11, 2023, 2610, 8, 38, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



In [None]:
'''
-----------------------------------------------------
BERT
    1. モデルの定義
    2. モデルの訓練
-----------------------------------------------------
'''

import torch
from torch.optim import AdamW
from transformers import BertModel, get_linear_schedule_with_warmup
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import accuracy_score


# 1. モデルの定義

class Bert(torch.nn.Module):
    """Pretrained Japanese BERT encoder with a linear classification head.

    Token id 0 is treated as padding when building the attention mask.

    Args:
        n_classes: Number of output classes of the linear head.
    """

    def __init__(self, n_classes=2):
        super(Bert, self).__init__()
        self.bert = BertModel.from_pretrained("cl-tohoku/bert-base-japanese")
        # Size the head from the checkpoint's config instead of assuming 768.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.ls = torch.nn.LogSoftmax(dim=-1)
        self.nll = torch.nn.NLLLoss()

    def _logits(self, x):
        """Encode a batch of token ids and return unnormalised class scores."""
        # .long() keeps the mask on the same device as x, so the module works
        # on CPU as well as GPU. It also avoids re-wrapping an existing tensor
        # in torch.tensor().
        attention_mask = (x != 0).long()
        outputs = self.bert(input_ids=x, attention_mask=attention_mask)
        # The second returned element is BertModel's pooled sentence
        # representation. Indexing works for tuple and ModelOutput returns.
        encoded_text = outputs[1]
        return self.linear(encoded_text)

    def forward(self, x, y):
        """Return the negative log-likelihood loss of labels ``y`` for batch ``x``.

        Args:
            x: Integer tensor of token ids, one row per example.
            y: Integer tensor of class ids, one per example.
        """
        return self.nll(self.ls(self._logits(x)), y)

    def predict(self, x):
        """Return class probabilities (softmax over the last dimension) for ``x``."""
        return self.softmax(self._logits(x))


# 2. モデルの訓練

def batcher(X_train, y_train, shuffle=False, batch_size=32, device=None):
    """Yield ``(x, y)`` mini-batches as integer tensors.

    Args:
        X_train: Indexable collection of token-id sequences.
        y_train: Indexable collection of integer class ids, aligned with
            ``X_train``.
        shuffle: If True, examples are visited in a fresh random order on
            each call, drawn from torch's global RNG. If False, they are
            visited in their original order.
        batch_size: Maximum number of examples per batch. The last batch
            may be smaller.
        device: Device for the yielded tensors. Defaults to CUDA when
            available, otherwise CPU.

    Yields:
        tuple: ``x`` of shape (batch, longest sequence in the batch),
        zero-padded, and ``y`` of shape (batch,), both ``torch.long``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    n_examples = len(X_train)
    if shuffle:
        index = torch.randperm(n_examples).tolist()
    else:
        index = list(range(n_examples))

    for start in range(0, n_examples, batch_size):
        idx = index[start:start + batch_size]
        x = [torch.tensor(X_train[i], dtype=torch.long) for i in idx]
        y = torch.tensor([int(y_train[i]) for i in idx], dtype=torch.long)
        # Pad with zeros to the longest sequence in this batch.
        x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
        yield x.to(device), y.to(device)

def evaluate(X_test, y_test, model):
    """Return the accuracy of ``model`` on ``(X_test, y_test)``.

    The model is switched to evaluation mode for the duration of the call,
    so dropout does not perturb the predictions, and is restored to its
    previous mode afterwards. Gradients are disabled while predicting.

    Args:
        X_test: Token-id sequences to classify.
        y_test: Gold class ids aligned with ``X_test``.
        model: Module exposing ``predict(x)`` that returns per-class scores.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    was_training = model.training
    model.eval()
    preds = list()
    try:
        with torch.no_grad():
            # batcher's default keeps the original order, so preds line up
            # with y_test.
            for x, _ in batcher(X_test, y_test):
                probs = model.predict(x)
                preds.extend(probs.argmax(dim=-1).cpu().tolist())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return accuracy_score(y_test, preds)

# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the head from the full label inventory rather than from the largest
# id that happens to occur in the training split.
model = Bert(n_classes=len(label2id))
model.to(device)

batch_size = 32
n_epochs = 3
# Ceiling division: number of batches the batcher yields per pass.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size

optimizer = AdamW(model.parameters(), lr=1e-5)
# NOTE(review): the schedule is laid out over 5 epochs although only
# `n_epochs` (3) are trained, so the learning rate never decays to zero.
# Kept as in the original; confirm whether this is intended.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=steps_per_epoch * 1,
    num_training_steps=steps_per_epoch * 5
)

for i in range(n_epochs):
    # Train on the training split.
    model.train()
    train_batches = batcher(X_train, y_train, shuffle=True, batch_size=batch_size)
    for x, y in tqdm(train_batches, total=steps_per_epoch):
        optimizer.zero_grad()
        loss = model(x, y)
        loss.backward()
        optimizer.step()
        scheduler.step()
    # Evaluate on the validation split.
    with torch.no_grad():
        dev_acc = evaluate(X_dev, y_dev, model)
    print('Epoch {}: Dev Accuracy = {:.3f}'.format(i + 1, dev_acc))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




Epoch 1: Dev Accuracy = 0.737


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


Epoch 2: Dev Accuracy = 0.848


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


Epoch 3: Dev Accuracy = 0.867


In [None]:
'''
-----------------------------------------------------
評価
-----------------------------------------------------
'''

# Final evaluation on the held-out test split. No gradients are needed.
with torch.no_grad():
    test_acc = evaluate(X_test=X_test, y_test=y_test, model=model)
print("Test accuracy = %1.3f" % (test_acc,))

Test accuracy = 0.881
