In [1]:
%%bash

# Stop at the first failing command so that a failed download is not
# silently followed by extraction of a missing or broken archive.
set -e

# Download the data

# -p: re-running this cell must not fail when dataset/ already exists.
mkdir -p dataset
# -f: treat HTTP errors as failures instead of saving the error page;
# -L: follow redirects; -sS: no progress meter, but still print errors.
curl -fsSL https://www.rondhuit.com/download/livedoor-news-data.tar.gz -o dataset/dataset.tar.gz
# Extract into dataset/ without changing the working directory, then
# remove the archive.
tar -xvf dataset/dataset.tar.gz -C dataset
rm dataset/dataset.tar.gz


# Install tools

# NOTE(review): left unpinned; the recorded run of this cell installed
# tensorflow-text 2.4.2. Pin a version if exact reproduction is required.
pip install tensorflow_text

dokujo-tsushin.xml
it-life-hack.xml
kaden-channel.xml
livedoor-homme.xml
movie-enter.xml
peachy.xml
smax.xml
sports-watch.xml
topic-news.xml
Collecting tensorflow_text
  Downloading https://files.pythonhosted.org/packages/55/b8/5884204f7c2da639a3061fe3a0c41a06bb80bf7976fa7d407e1d628e38e9/tensorflow_text-2.4.2-cp36-cp36m-manylinux1_x86_64.whl (3.4MB)
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.4.2


In [2]:
'''
-----------------------------------------------------
データの前処理（今回は単語分割が不要です）
-----------------------------------------------------
'''

# 1. XMLからのテキスト抽出

import glob
import xml.etree.ElementTree as ET

def get_data(file_name, target):
    """Collect the text of every field named ``target`` in an XML file.

    The file is parsed with ElementTree. For each child of the root element,
    every sub-element whose ``name`` attribute equals ``target`` contributes
    its text, in document order.

    Args:
        file_name: Path of the XML file to parse.
        target: Value of the ``name`` attribute to select (e.g. "title").

    Returns:
        A list with the ``.text`` of each matching element. An entry is
        ``None`` when a matching element has no text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ET.parse(file_name)
    return [
        element.text
        for doc in tree.getroot()
        for element in doc
        # Elements without a "name" attribute are skipped instead of aborting
        # the whole load with a KeyError.
        if "name" in element.attrib and element.attrib["name"] == target
    ]

# Titles and categories are read in two separate passes per file, so they
# only line up index-for-index when each file yields the same number of both.
# Files are visited in sorted order to keep the result deterministic.
titles, labels = list(), list()
for file_name in sorted(glob.glob("dataset/*.xml")):
    file_titles = get_data(file_name, target="title")
    file_labels = get_data(file_name, target="cat")
    if len(file_titles) != len(file_labels):
        # Fail loudly: misaligned pairs would silently corrupt the labels.
        raise ValueError(
            "%s: %d titles but %d categories"
            % (file_name, len(file_titles), len(file_labels))
        )
    titles.extend(file_titles)
    labels.extend(file_labels)


# 2. 訓練用／検証用／評価用に分割

import numpy as np
# Fix NumPy's global RNG so the np.random.shuffle-based split is reproducible.
np.random.seed(42)

def shuffle(list1, list2):
    """Shuffle two parallel sequences with one shared random permutation.

    The inputs are zipped into pairs, the pairs are shuffled in place with
    NumPy's global RNG, and the pairs are split back into two lists, so the
    i-th items of the results still belong together. The inputs themselves
    are not modified.

    Args:
        list1: First sequence.
        list2: Second sequence, paired with ``list1`` by position.

    Returns:
        A tuple ``(shuffled1, shuffled2)`` of two lists. Both are empty when
        the inputs produce no pairs.
    """
    tmp = list(zip(list1, list2))
    if not tmp:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError on empty input.
        return [], []
    np.random.shuffle(tmp)
    list1, list2 = zip(*tmp)
    return list(list1), list(list2)

# Shuffle titles and labels together, then cut three consecutive slices:
# the first 5000 pairs for training, the next 1000 for development and the
# following 1000 for testing.
texts, labels = shuffle(titles, labels)
train_part, dev_part, test_part = slice(0, 5000), slice(5000, 6000), slice(6000, 7000)
texts_train, texts_dev, texts_test = texts[train_part], texts[dev_part], texts[test_part]
labels_train, labels_dev, labels_test = labels[train_part], labels[dev_part], labels[test_part]


# Check titles and categories: print the first example of each split.

for split_labels, split_texts in (
    (labels_train, texts_train),
    (labels_dev, texts_dev),
    (labels_test, texts_test),
):
    print("カテゴリ: %s" % split_labels[0])
    print("タイトル: %s\n" % split_texts[0])

カテゴリ: sports-watch
タイトル: 猫ひろしの五輪出場暗転、為末は「陸上選手の大半の反応は、え、それをアウトにしちゃうの？」

カテゴリ: smax
タイトル: ソフトバンク、史上最速下り最大110Mbpsに対応したモバイルWi-Fiルーター「ULTRA WiFi 4G 102HW」を発表！SoftBank 4GおよびULTRA SPEEDに対応

カテゴリ: topic-news
タイトル: 「ZIP!」スタッフに東野幸治が怒り「謝罪を求めます。」



In [3]:
'''
-----------------------------------------------------
Universal Sentence Encoder
-----------------------------------------------------
'''

import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer

# Load the multilingual Universal Sentence Encoder (large, v3) from TF Hub.
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# Encode each split with the same model; every split is passed in one call.
X_train, X_dev, X_test = [
    use(split) for split in (texts_train, texts_dev, texts_test)
]

# Number of training vectors, their dimensionality, and the first vector.
first_vector = X_train[0]
print(len(X_train), len(first_vector), first_vector)

5000 512 tf.Tensor(
[ 5.29984906e-02  5.07583991e-02  2.42145490e-02 -1.37642520e-02
  6.29667491e-02 -3.61849391e-03 -4.06696834e-02  1.33339046e-02
 -7.93643594e-02  4.04781960e-02 -6.90994039e-02 -9.72878709e-02
 -8.67387429e-02  2.55118366e-02 -2.91793849e-02  1.31003931e-01
  2.28029359e-02 -1.36984840e-01 -3.84882465e-02 -2.63786204e-02
 -5.62103949e-02 -4.08302508e-02  7.11401254e-02  5.81779564e-03
 -6.99440390e-02 -5.65999523e-02 -2.23118272e-02  6.40609562e-02
 -6.47174045e-02  2.77462229e-02  5.72719499e-02  1.59521308e-02
 -1.98397841e-02 -1.20235886e-02 -1.14549510e-02  4.99239676e-02
  2.68899743e-02  3.83476615e-02 -7.33915493e-02  3.16035971e-02
 -4.73614559e-02 -1.83715839e-02  3.62400524e-02 -3.36119607e-02
  4.58258800e-02  6.80052629e-03 -4.44902182e-02 -3.79182212e-02
  2.24774815e-02 -3.17679569e-02 -4.65216860e-02 -1.37172462e-02
  4.73175421e-02 -8.06091428e-02  4.33726721e-02  2.81160977e-02
 -5.28205670e-02  1.08875921e-02 -6.00579008e-02 -9.62352753e-02
  5.7

In [4]:
'''
-----------------------------------------------------
分類器の訓練
-----------------------------------------------------
'''

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Convert labels to IDs: ids follow the sorted order of the distinct labels.

label2id = {name: index for index, name in enumerate(sorted(set(labels)))}
y_train, y_dev, y_test = (
    [label2id[name] for name in split]
    for split in (labels_train, labels_dev, labels_test)
)


# Train the classifier: pick the regularisation strength C by dev accuracy.

# Start from "no score yet" rather than 0 so that a candidate is always
# selected, even if every dev accuracy is 0.0. Otherwise best_c would stay 0,
# which is not a valid C for the final LogisticRegression fit.
best_c, best_score = None, float("-inf")
for c in [0.1, 1, 10]:
    classifier = LogisticRegression(C=c, max_iter=1000)
    classifier.fit(X_train, y_train)
    dev_acc = accuracy_score(y_dev, classifier.predict(X_dev))
    # Strict comparison: on a tie the earlier (smaller) C is kept.
    if best_score < dev_acc:
        best_score = dev_acc
        best_c = c
    print("Dev accuracy = %1.3f\tC = %s" % (dev_acc, str(c)))
print("Best parameter: C = %s" % str(best_c))

Dev accuracy = 0.752	C = 0.1
Dev accuracy = 0.794	C = 1
Dev accuracy = 0.797	C = 10
Best parameter: C = 10


In [5]:
'''
-----------------------------------------------------
評価
-----------------------------------------------------
'''

# Refit on the training split with the C chosen on the dev split, then report
# accuracy on the held-out test split.
classifier = LogisticRegression(C=best_c, max_iter=1000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test accuracy = %1.3f" % test_acc)

Test accuracy = 0.808
