In [1]:
%%bash

# データのダウンロード

mkdir dataset
curl -Ss https://www.rondhuit.com/download/livedoor-news-data.tar.gz > dataset/dataset.tar.gz
cd dataset
tar -xvf dataset.tar.gz
rm dataset.tar.gz
cd ../


# 形態素解析器(MeCab)のインストール

apt-get update
apt-get install mecab file swig libmecab-dev mecab-ipadic-utf8
pip install mecab-python3==0.996.5


# 単語ベクトル(fastText)のダウンロード（3分ぐらい）

pip install gensim
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
gunzip cc.ja.300.vec.gz

dokujo-tsushin.xml
it-life-hack.xml
kaden-channel.xml
livedoor-homme.xml
movie-enter.xml
peachy.xml
smax.xml
sports-watch.xml
topic-news.xml
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release [564 B]
Get:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release.gpg [833 B]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [43.2 kB]
Hit:10 http://ar

--2021-01-09 06:32:33--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1279641604 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ja.300.vec.gz’

     0K .......... .......... .......... .......... ..........  0%  112K 3h5m
    50K .......... .......... .......... .......... ..........  0% 13.4M 93m22s
   100K .......... .......... .......... .......... ..........  0%  225K 93m6s
   150K .......... .......... .......... .......... ..........  0%  115M 69m52s
   200K .......... .......... .......... .......... ..........  0%  226K 74m17s
   250K .......... .......... .......... .......... ..........  0% 45.2M 61m59s
   300K .......... .......... .......... .......... ..........  0%  122M 53m9s
   350K .......... .

In [2]:
'''
-----------------------------------------------------
データの前処理
-----------------------------------------------------
'''

# 1. XMLからのテキスト抽出

import glob
import xml.etree.ElementTree as ET

def get_data(file_name, target):
    data = list()
    tree = ET.parse(file_name)
    for doc in tree.getroot():
        for element in doc:
            if element.attrib["name"] == target:
                data.append(element.text)
    return data

titles, labels = list(), list()
for file_name in sorted(glob.glob("dataset/*.xml")):
    titles.extend(get_data(file_name, target="title"))
    labels.extend(get_data(file_name, target="cat"))


# 2. 単語分割

import MeCab
mecab = MeCab.Tagger()

def morphological_analysis(sentence, mecab):
    surfaces = list()
    features = list()
    node = mecab.parseToNode(sentence)
    while node:
        surfaces.append(node.surface)
        features.append(node.feature)
        node = node.next
    return surfaces[1:-1], features[1:-1]

texts = list()
for title in titles:
    texts.append(" ".join(morphological_analysis(title, mecab)[0]))


# 3. 訓練用／検証用／評価用に分割

import numpy as np
np.random.seed(seed=42)

def shuffle(list1, list2):
    tmp = list(zip(list1, list2))
    np.random.shuffle(tmp)
    list1, list2 = zip(*tmp)
    return list(list1), list(list2)

texts, labels = shuffle(texts, labels)
texts_train, labels_train = texts[:5000], labels[:5000]
texts_dev, labels_dev = texts[5000:6000], labels[5000:6000]
texts_test, labels_test = texts[6000:7000], labels[6000:7000]


# タイトルとカテゴリの確認

print("カテゴリ: %s" % labels_train[0])
print("タイトル: %s\n" % texts_train[0])
print("カテゴリ: %s" % labels_dev[0])
print("タイトル: %s\n" % texts_dev[0])
print("カテゴリ: %s" % labels_test[0])
print("タイトル: %s\n" % texts_test[0])

カテゴリ: sports-watch
タイトル: 猫 ひろし の 五輪 出場 暗転 、 為 末 は 「 陸上 選手 の 大半 の 反応 は 、 え 、 それ を アウト に し ちゃう の ？ 」

カテゴリ: smax
タイトル: ソフトバンク 、 史上 最速 下り 最大 110 Mbps に 対応 し た モバイル Wi - Fi ルーター 「 ULTRA WiFi 4 G 102 HW 」 を 発表 ！ SoftBank 4 G および ULTRA SPEED に 対応

カテゴリ: topic-news
タイトル: 「 ZIP !」 スタッフ に 東野 幸治 が 怒り 「 謝罪 を 求め ます 。 」



In [3]:
'''
-----------------------------------------------------
SWEM
    1. 単語ベクトルの読み込み
    2. 単語ベクトルを平均
-----------------------------------------------------
'''

# 0. 単語ベクトルのファイルから不要な単語を削除（全部読み込むと10分ぐらいかかるので）

vocab = set()
for word_list in texts:
    for word in word_list.split():
        vocab.add(word)

fin = open("/content/cc.ja.300.vec", "r")
vectors = list()
for line in fin:
    if line.split()[0] in vocab:
        vectors.append(line)
fin.close()

fout = open("/content/cc.ja.300-livedoornews.vec", "w")
fout.write("%d 300\n" % len(vectors))
for vector in vectors:
    fout.write(vector)
fout.close()


# 1. 単語ベクトルの読み込み

from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format("/content/cc.ja.300-livedoornews.vec")
w2v_vocab = set(w2v.vocab.keys())


# 2. 単語ベクトルを平均

def swem_vectorize(texts, w2v, w2v_vocab):
    vectors = list()
    for words in texts:
        w2vs = list()
        for word in words.split():
            if word in w2v_vocab:
                w2vs.append(w2v[word])
        vectors.append(np.array(w2vs).mean(axis=0))
    return vectors

X_train = swem_vectorize(texts_train, w2v, w2v_vocab)
X_dev = swem_vectorize(texts_dev, w2v, w2v_vocab)
X_test = swem_vectorize(texts_test, w2v, w2v_vocab)

print(len(X_train), len(X_train[0]), X_train[0])

5000 300 [ 4.65200059e-02 -2.20766664e-02  3.29766691e-01  1.39626682e-01
 -1.25566684e-02  2.66966689e-02  3.00766658e-02  3.89666762e-03
  1.93833299e-02 -1.46100009e-02  6.81300014e-02  9.60666593e-03
 -3.38566676e-02  3.11200060e-02 -8.83000046e-02 -3.99933383e-02
  1.17233340e-02 -3.98800001e-02 -1.07319988e-01  4.68999892e-03
  1.44966636e-02  6.87166601e-02 -1.12486690e-01 -9.22999997e-03
 -2.63466649e-02  4.31800038e-02  6.78133294e-02 -6.43600002e-02
 -5.92333358e-03  5.26333451e-02 -1.47646666e-01  2.60266699e-02
  5.41999657e-03 -9.20666941e-03  6.00399971e-02 -3.77133377e-02
  3.89666632e-02 -2.52533332e-02 -3.29000363e-03 -4.61999932e-03
  1.44166658e-02  6.93666376e-03 -2.76366677e-02  5.65333245e-03
 -8.09999823e-04 -1.45333316e-02  1.57333491e-03  3.74200009e-02
  6.10333271e-02 -4.55000484e-03 -5.53800017e-02 -1.13400007e-02
 -3.99666699e-03 -2.40900014e-02 -7.00999936e-03  1.40001372e-04
  1.39866667e-02 -4.48033251e-02 -1.18900007e-02  1.74166653e-02
  1.98300015e-02

In [4]:
'''
-----------------------------------------------------
分類器の訓練
-----------------------------------------------------
'''

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ラベルをIDに変換

label2id = dict()
for label in sorted(set(labels)):
    label2id[label] = len(label2id)
y_train = [label2id[label] for label in labels_train]
y_dev = [label2id[label] for label in labels_dev]
y_test = [label2id[label] for label in labels_test]


# 分類器の訓練

best_c, best_score = 0, 0
for c in [0.1, 1, 10]:
    classifier = LogisticRegression(C=c, max_iter=1000)
    classifier.fit(X_train, y_train)
    dev_acc = accuracy_score(y_dev, classifier.predict(X_dev))
    if best_score < dev_acc:
        best_score = dev_acc
        best_c = c
    print("Dev accuracy = %1.3f\tC = %s" % (dev_acc, str(c)))
print("Best parameter: C = %s" % str(best_c))

Dev accuracy = 0.675	C = 0.1
Dev accuracy = 0.778	C = 1
Dev accuracy = 0.790	C = 10
Best parameter: C = 10


In [5]:
'''
-----------------------------------------------------
評価
-----------------------------------------------------
'''

classifier = LogisticRegression(C=best_c, max_iter=1000)
classifier.fit(X_train, y_train)
test_acc = accuracy_score(y_test, classifier.predict(X_test))
print("Test accuracy = %1.3f" % test_acc)

Test accuracy = 0.797
