In [1]:
%%bash

# Download the dataset.
# -p: re-running the cell must not fail because the directory already exists.
# -f: fail on HTTP errors instead of saving an error page as the archive.
# The steps are chained with && so the archive is only extracted/removed
# after a successful download, and no `cd` is needed.

mkdir -p dataset
curl -fsSL https://www.rondhuit.com/download/livedoor-news-data.tar.gz -o dataset/dataset.tar.gz \
  && tar -xvf dataset/dataset.tar.gz -C dataset \
  && rm dataset/dataset.tar.gz


# Install the morphological analyzer (MeCab).
# -y: this cell runs non-interactively, so a confirmation prompt cannot be answered.

apt-get update
apt-get install -y mecab file swig libmecab-dev mecab-ipadic-utf8
pip install mecab-python3==0.996.5

dokujo-tsushin.xml
it-life-hack.xml
kaden-channel.xml
livedoor-homme.xml
movie-enter.xml
peachy.xml
smax.xml
sports-watch.xml
topic-news.xml
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release [564 B]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release.gpg [833 B]
Get:10 http://archive.ubuntu.com/ubuntu bioni

In [2]:
'''
-----------------------------------------------------
データの前処理
-----------------------------------------------------
'''

# 1. XMLからのテキスト抽出

import glob
import xml.etree.ElementTree as ET

def get_data(file_name, target):
    """Collect the text of every field named ``target`` from an XML file.

    The file is parsed with ``xml.etree.ElementTree``. Each child of the root
    element is treated as one document; every sub-element of a document whose
    ``name`` attribute equals ``target`` contributes its ``.text``.

    Args:
        file_name: Path or file object accepted by ``ET.parse``.
        target: Value of the ``name`` attribute to select (e.g. "title").

    Returns:
        list: ``.text`` of each matching element, in document order. An entry
        is ``None`` when the matching element has no text.
    """
    root = ET.parse(file_name).getroot()
    return [
        field.text
        for doc in root
        for field in doc
        # Check for the attribute first: sub-elements without a "name"
        # attribute are skipped instead of raising KeyError.
        if "name" in field.attrib and field.attrib["name"] == target
    ]

# Read every XML file (in sorted path order) and accumulate the "title" and
# "cat" fields into two parallel lists.
titles = []
labels = []
for xml_path in sorted(glob.glob("dataset/*.xml")):
    titles += get_data(xml_path, target="title")
    labels += get_data(xml_path, target="cat")


# 2. 単語分割

import MeCab
# Instantiate a MeCab tagger with default arguments; it is passed to
# morphological_analysis() when the titles are tokenized below.
mecab = MeCab.Tagger()

def morphological_analysis(sentence, mecab):
    """Tokenize ``sentence`` by walking the node list returned by ``mecab``.

    Starting from ``mecab.parseToNode(sentence)``, follows ``.next`` until a
    falsy node is reached, recording each node's ``surface`` and ``feature``.
    The first and last nodes are dropped from the result (presumably MeCab's
    BOS/EOS sentinel nodes -- confirm against the MeCab documentation).

    Args:
        sentence: Text to analyze.
        mecab: Object exposing ``parseToNode(sentence)``.

    Returns:
        tuple: ``(surfaces, features)``, two lists of equal length.
    """
    visited = list()
    current = mecab.parseToNode(sentence)
    while current:
        visited.append((current.surface, current.feature))
        current = current.next
    # Strip the first and last entries of the walk.
    inner = visited[1:-1]
    return [surface for surface, _ in inner], [feature for _, feature in inner]

# Tokenize every title and re-join its surface forms with single spaces, so
# later steps can recover the tokens with str.split().
texts = [
    " ".join(morphological_analysis(title, mecab)[0])
    for title in titles
]


# 3. 訓練用／検証用／評価用に分割

import numpy as np
# Seed NumPy's global RNG; shuffle() below relies on np.random.shuffle, so
# this fixes the order used for the train/dev/test split.
np.random.seed(seed=42)

def shuffle(list1, list2):
    """Shuffle two parallel sequences with one shared random permutation.

    The inputs are zipped into pairs, the pairs are shuffled in place with
    ``np.random.shuffle`` (NumPy's global RNG), and the pairs are unzipped
    again, so element ``i`` of both outputs still belongs together. The
    inputs themselves are not modified.

    Args:
        list1: First sequence.
        list2: Second sequence; pairing follows ``zip`` semantics, so extra
            items of the longer input are dropped.

    Returns:
        tuple: ``(shuffled1, shuffled2)`` as two new lists.
    """
    pairs = list(zip(list1, list2))
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError; empty input simply shuffles to empty output.
        return list(), list()
    np.random.shuffle(pairs)
    shuffled1, shuffled2 = zip(*pairs)
    return list(shuffled1), list(shuffled2)

texts, labels = shuffle(texts, labels)

# Three consecutive, non-overlapping slices of the shuffled data:
# 5000 training, 1000 development and 1000 test examples.
train_span, dev_span, test_span = slice(0, 5000), slice(5000, 6000), slice(6000, 7000)
texts_train, labels_train = texts[train_span], labels[train_span]
texts_dev, labels_dev = texts[dev_span], labels[dev_span]
texts_test, labels_test = texts[test_span], labels[test_span]


# タイトルとカテゴリの確認

# Show the first (category, title) pair of each split as a sanity check.
for split_labels, split_texts in (
    (labels_train, texts_train),
    (labels_dev, texts_dev),
    (labels_test, texts_test),
):
    print("カテゴリ: %s" % split_labels[0])
    print("タイトル: %s\n" % split_texts[0])

カテゴリ: sports-watch
タイトル: 猫 ひろし の 五輪 出場 暗転 、 為 末 は 「 陸上 選手 の 大半 の 反応 は 、 え 、 それ を アウト に し ちゃう の ？ 」

カテゴリ: smax
タイトル: ソフトバンク 、 史上 最速 下り 最大 110 Mbps に 対応 し た モバイル Wi - Fi ルーター 「 ULTRA WiFi 4 G 102 HW 」 を 発表 ！ SoftBank 4 G および ULTRA SPEED に 対応

カテゴリ: topic-news
タイトル: 「 ZIP !」 スタッフ に 東野 幸治 が 怒り 「 謝罪 を 求め ます 。 」



In [3]:
'''
-----------------------------------------------------
Bag of Words
-----------------------------------------------------
'''

from collections import defaultdict


# 1. 単語の頻度をカウント

# Count how often each whitespace-separated token occurs in the training texts.
word2freq = defaultdict(int)
for token in (tok for sentence in texts_train for tok in sentence.split()):
    word2freq[token] += 1
print("語彙サイズ: %d" % len(word2freq))


# 2. 語彙制限（高頻度な500単語に制限）

# Keep the 500 most frequent words. sorted() is stable, so words with equal
# counts keep the order in which they were first inserted into word2freq.
vocab = sorted(word2freq, key=word2freq.get, reverse=True)[:500]
print("語彙サイズ: %d" % len(vocab))


# 3. 各単語にIDを割り当てる

# Map each vocabulary word to its position in vocab (ids 0 .. len(vocab) - 1).
# vocab holds distinct words because it was built from dictionary keys.
word2id = {word: index for index, word in enumerate(vocab)}
print("単語 iPhone のID: %s" % word2id["iPhone"])


# 4. テキストをベクトル化する

def bow_vectorize(texts, vocab, word2id):
    """Turn space-tokenized texts into binary bag-of-words vectors.

    For every text a zero vector of length ``len(vocab)`` is created and the
    component ``word2id[word]`` is set to 1 for each token that belongs to
    ``vocab``. Repeated tokens still yield 1 (presence, not counts); tokens
    outside the vocabulary are ignored.

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace.
        vocab: Collection of in-vocabulary words.
        word2id: Mapping from each vocabulary word to its vector index.

    Returns:
        list: One ``numpy.ndarray`` of shape ``(len(vocab),)`` per text.
    """
    # Hoisted out of the loops: membership in the original list costs
    # O(len(vocab)) per token, a set lookup is O(1).
    vocab_set = set(vocab)
    size = len(vocab)
    vectors = list()
    for text in texts:
        vector = np.zeros(size)
        for word in text.split():
            if word in vocab_set:
                vector[word2id[word]] = 1
        vectors.append(vector)
    return vectors

# Vectorize all three splits with the same vocabulary and id mapping.
X_train, X_dev, X_test = (
    bow_vectorize(split, vocab, word2id)
    for split in (texts_train, texts_dev, texts_test)
)

print(len(X_train), len(X_train[0]), X_train[0])

語彙サイズ: 12455
語彙サイズ: 500
単語 iPhone のID: 51
5000 500 [1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.


In [4]:
'''
-----------------------------------------------------
TF-IDF
    1. 文書頻度の計算
    2. TF-IDFの計算
    3. BoWの重み付け
-----------------------------------------------------
'''

import math


# 1. 文書頻度の計算

# Document frequency: for each word, the number of training texts that
# contain it at least once (each text is reduced to its set of tokens).
word2df = defaultdict(int)
for unique_words in map(set, map(str.split, texts_train)):
    for term in unique_words:
        word2df[term] += 1

# Total number of training documents, used as the numerator of the IDF term.
n = len(texts_train)


# 2. TFIDFの計算
def tfidf(word, word_list, word2df, n):
    """Compute the TF-IDF weight of ``word`` within one tokenized document.

    TF is ``word_list.count(word) / len(word_list)`` and IDF is
    ``log(n / word2df[word])`` (natural logarithm).

    Args:
        word: The term to weight.
        word_list: Tokens of the document.
        word2df: Mapping from term to document frequency.
        n: Total number of documents the frequencies were counted over.

    Returns:
        float: The TF-IDF weight. ``0.0`` when ``word_list`` is empty or when
        ``word`` has no (or zero) document frequency, instead of raising
        ZeroDivisionError.
    """
    # .get() rather than indexing: looking up a missing key in a defaultdict
    # would insert it with value 0 and then divide by zero.
    df = word2df.get(word, 0)
    if df == 0 or not word_list:
        return 0.0
    return (word_list.count(word) / len(word_list)) * math.log(n / df)


# 3. BoWの重み付け

def tfidf_vectorize(texts, vocab, word2id, word2df, n):
    """Turn space-tokenized texts into TF-IDF weighted bag-of-words vectors.

    For every text a zero vector of length ``len(vocab)`` is created and the
    component ``word2id[word]`` is set to ``tfidf(word, tokens, word2df, n)``
    for each in-vocabulary word that occurs in the text. Words outside the
    vocabulary are ignored.

    Args:
        texts: Iterable of strings whose tokens are separated by whitespace.
        vocab: Collection of in-vocabulary words.
        word2id: Mapping from each vocabulary word to its vector index.
        word2df: Mapping from word to document frequency.
        n: Number of documents the document frequencies were counted over.

    Returns:
        list: One ``numpy.ndarray`` of shape ``(len(vocab),)`` per text.
    """
    # Hoisted out of the loops: membership in the original list costs
    # O(len(vocab)) per token, a set lookup is O(1).
    vocab_set = set(vocab)
    size = len(vocab)
    vectors = list()
    for text in texts:
        word_list = text.split()
        vector = np.zeros(size)
        # Each distinct in-vocabulary word is weighted once; a repeated token
        # would only recompute and store the same value again.
        for word in vocab_set.intersection(word_list):
            vector[word2id[word]] = tfidf(word, word_list, word2df, n)
        vectors.append(vector)
    return vectors

# Re-vectorize all three splits with TF-IDF weights. This rebinds X_train,
# X_dev and X_test, replacing the binary bag-of-words features.
X_train, X_dev, X_test = (
    tfidf_vectorize(split, vocab, word2id, word2df, n)
    for split in (texts_train, texts_dev, texts_test)
)

print(len(X_train), len(X_train[0]), X_train[0])

5000 500 [0.06084499 0.02824104 0.03228791 0.         0.03126751 0.0318864
 0.         0.         0.         0.12167204 0.         0.09508556
 0.         0.         0.06478377 0.         0.         0.
 0.         0.07583233 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.       

In [5]:
'''
-----------------------------------------------------
分類器の訓練
-----------------------------------------------------
'''

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ラベルをIDに変換

# Give each distinct category an integer id following the alphabetical order
# of the category names, then encode the labels of every split.
label2id = {name: index for index, name in enumerate(sorted(set(labels)))}
y_train, y_dev, y_test = (
    [label2id[name] for name in split_labels]
    for split_labels in (labels_train, labels_dev, labels_test)
)


# 分類器の訓練

# Grid search over the regularization parameter C, selected by accuracy on
# the development set. best_score starts below any possible accuracy so the
# first candidate is always recorded; starting from (0, 0) would leave
# best_c = 0, which is not a valid C, if every candidate scored 0.
best_c, best_score = None, float("-inf")
for c in [0.1, 1, 10]:
    classifier = LogisticRegression(C=c, max_iter=1000)
    classifier.fit(X_train, y_train)
    dev_acc = accuracy_score(y_dev, classifier.predict(X_dev))
    # Strict comparison: on ties the earlier (smaller) C is kept.
    if best_score < dev_acc:
        best_score = dev_acc
        best_c = c
    print("Dev accuracy = %1.3f\tC = %s" % (dev_acc, str(c)))
print("Best parameter: C = %s" % str(best_c))

Dev accuracy = 0.712	C = 0.1
Dev accuracy = 0.772	C = 1
Dev accuracy = 0.777	C = 10
Best parameter: C = 10


In [6]:
'''
-----------------------------------------------------
評価
-----------------------------------------------------
'''

# Refit with the C selected on the development set and report accuracy on
# the held-out test split.
classifier = LogisticRegression(C=best_c, max_iter=1000).fit(X_train, y_train)
predictions_test = classifier.predict(X_test)
test_acc = accuracy_score(y_test, predictions_test)
print("Test accuracy = %1.3f" % test_acc)

Test accuracy = 0.790
