# ベクトル化された小説データに対するSVMを用いたタグ分類 (出現率上位100件のタグすべてに実行)

In [1]:
import gensim
import json
import numpy as np
import os
import collections
from sklearn import svm
from sklearn import cross_validation
from sklearn.externals import joblib
from sklearn.externals.joblib import Parallel, delayed

In [2]:
# Load the Doc2Vec model stored in 'novel2vec.model'; fit_svm reads its
# document vectors (model.docvecs) as the feature vectors for every classifier.
# NOTE(review): gensim's load presumably unpickles the file -- only load model
# files that come from a trusted source.
model = gensim.models.doc2vec.Doc2Vec.load('novel2vec.model')

In [3]:
# Mapping of tag name -> ids of the novels carrying that tag, as consumed by
# fit_svm. The context manager closes the file as soon as parsing is done
# instead of leaving the handle open until garbage collection.
with open('top-tag-ids.json') as tag_file:
    top_tags = json.load(tag_file)

In [4]:
# Names of the tags to train on: the keys of top_tags, one classifier per key.
tag_names = top_tags.keys()

In [5]:
def fit_svm(tag_name):
    """Train, persist and evaluate a linear SVM that detects a single tag.

    Every document vector in the global Doc2Vec ``model`` is used as a sample.
    A sample is labelled 1 when its doctag, converted with ``int()``, appears
    in ``top_tags[tag_name]`` and 0 otherwise. The samples are split 80/20
    with ``random_state=10``; the classifier is fitted on the training part,
    dumped to ``svms/<tag_name>/classifier.pkl`` and then scored on the
    held-out part.

    Args:
        tag_name: Key of ``top_tags`` naming the tag to learn.

    Returns:
        A dict holding ``tag_name``, the four confusion counts
        (``true_positives``, ``true_negatives``, ``false_positives``,
        ``false_negatives``) and the doctags of the misclassified test
        samples (``false_positive_ids``, ``false_negative_ids``).

    Raises:
        OSError: If the output directory cannot be created.
    """
    # A set makes each membership test O(1); the original list lookup was
    # repeated for every document in the model.
    target_novel_ids = set(top_tags[tag_name])

    X = []
    y = []
    ids = []
    for index, vector in enumerate(model.docvecs):
        novel_id = model.docvecs.index_to_doctag(index)
        X.append(vector)
        ids.append(novel_id)
        y.append(1 if int(novel_id) in target_novel_ids else 0)

    X_train, X_test, y_train, y_test, ids_train, ids_test = cross_validation.train_test_split(
        X, y, ids, test_size=0.2, random_state=10)
    clf = svm.SVC(kernel='linear').fit(X_train, y_train)

    output_dir = 'svms/%s' % tag_name
    try:
        os.makedirs(output_dir)
    except OSError:
        # An already existing directory (e.g. from an earlier run or another
        # worker) is fine; any other failure is re-raised.
        if not os.path.isdir(output_dir):
            raise
    joblib.dump(clf, 'svms/%s/classifier.pkl' % tag_name)

    y_predict = clf.predict(X_test)

    false_negatives = 0
    false_positives = 0
    true_negatives = 0
    true_positives = 0
    false_positive_ids = []
    false_negative_ids = []
    # The four cases are mutually exclusive, so a single elif chain suffices.
    for actual, predicted, novel_id in zip(y_test, y_predict, ids_test):
        if actual == 0 and predicted == 1:
            false_positive_ids.append(novel_id)
            false_positives += 1
        elif actual == 1 and predicted == 0:
            false_negative_ids.append(novel_id)
            false_negatives += 1
        elif actual == 1 and predicted == 1:
            true_positives += 1
        elif actual == 0 and predicted == 0:
            true_negatives += 1

    print('Successfully trained %s SVM' % tag_name)

    return {
        'tag_name': tag_name,
        'false_negatives': false_negatives,
        'false_positives': false_positives,
        'true_negatives': true_negatives,
        'true_positives': true_positives,
        'false_negative_ids': false_negative_ids,
        'false_positive_ids': false_positive_ids,
    }

In [6]:
# Call fit_svm once per tag through joblib's Parallel with n_jobs=36 and keep
# the dicts it returns in infos. %time prints the timing of the whole statement.
%time infos = Parallel(n_jobs=36)(delayed(fit_svm)(tag_name) for tag_name in tag_names)

Successfully trained 図書館戦争 SVM
Successfully trained とうらぶちゃんねる SVM
Successfully trained 進撃の腐人 SVM
Successfully trained チョロ松 SVM
Successfully trained 文学 SVM
Successfully trained 創作 SVM
Successfully trained ワートリ【腐】 SVM
Successfully trained オメガバース SVM
Successfully trained 死ネタ SVM
Successfully trained 詩 SVM
Successfully trained パロディ SVM
Successfully trained 冒険 SVM
Successfully trained ラブライブ! SVM
Successfully trained R-15 SVM
Successfully trained 燭台切光忠 SVM
Successfully trained アイドルマスターシンデレラガールズ SVM
Successfully trained 続きを全裸待機 SVM
Successfully trained 銀魂 SVM
Successfully trained アイドリッシュセ腐ン SVM
Successfully trained 鶴丸国永 SVM
Successfully trained 一松 SVM
Successfully trained 男審神者 SVM
Successfully trained やはり俺の青春ラブコメはまちがっている。 SVM
Successfully trained ヒロアカ【腐】 SVM
Successfully trained カラ一 SVM
Successfully trained ブラック本丸 SVM
Successfully trained おそチョロ SVM
Successfully trained BL SVM
Successfully trained 赤井秀一 SVM
Successfully trained BL松小説100users入り SVM
Successfully trained サンプル SVM
Successfully trai

In [7]:
# Persist the per-tag evaluation summaries collected in infos as JSON.
with open('classifier-info.json', 'w') as info_file:
    json.dump(infos, fp=info_file)