In [113]:
import numpy as np
import pandas as pd
import jieba
import re
import collections
import matplotlib.pyplot as plt
from pylab import rcParams
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
# Customizing plots with style 
rcParams['figure.figsize'] = 10, 5
rcParams['lines.linewidth'] = 2
plt.style.use('ggplot')

In [10]:
def filter_outlier_by_article(df, std_num):
    df = df.dropna()
    std = df["POST_CONTENT"].str.len().std()
    mean = df["POST_CONTENT"].str.len().mean()
    upper = mean + std_num*std
    return df.loc[df["POST_CONTENT"].str.len()<upper,:]

In [36]:
def filter_not_chinese_word(document):
# 只取中文
    try:
        document = "".join(re.findall(r"[\u4e00-\u9fa5]+", document))
        return document
    except Exception as e:
        print("{}, index {}".format(str(e), index))

In [30]:
def tokenize_document(doc, stop_word_list):
    preprocessed_document = jieba.cut(doc)
    # 去除保留字
    preprocessed_document = list(filter(lambda x: x not in stop_word_list, preprocessed_document))
    return preprocessed_document

In [192]:
# 取得該群的所有文檔
def get_document_by_cluster(corpus, cluster_labels, cluster_id):
    cluster_corpus = []
    for index, label in enumerate(cluster_labels, 0):
        if label == cluster_id:
            cluster_corpus.append(corpus[index])
    return cluster_corpus

In [264]:
#  統計文檔關鍵字
def count_doc_word_freq(docs):
    word_list = []
    for doc in docs:
        for word in doc:
            word_list.append(word)
    counter = collections.Counter(word_list)
    return counter.most_common()[:5]

In [266]:
def get_topic_tag(test_doc, train_corpus, model, topic_num=10):
    # 取得文檔向量
    vectors = model.docvecs.vectors_docs
    # 分群
    k_mean = KMeans(n_clusters=topic_num).fit(vectors)
    cluster_labels = k_mean.labels_
    # 建立測試向量
    doc_vector = model.infer_vector(test_doc).reshape(1,50)
    # 分群來去標示主題
    cluster_id = int(k_mean.predict(test_doc_vector))
    # 取的分群文檔
    cluster_docs = get_document_by_cluster(train_corpus, cluster_labels, cluster_id)
    # 統計主題，自動取得文章的 tags
    term_feq_list = count_doc_word_freq(cluster_docs)
    return term_feq_list

In [125]:
# 收集自維基百科
with open("data/ref_text_tw.txt", "r", encoding="utf-8") as content:
    content_list = [line.strip().replace(' ', '') for line in content]

In [131]:
# 用來存放分詞後的結果
preprocessed_documents = []
# stopword
with open("data/jieba_dict/stopwords.txt") as stop_words:
    stop_word_list = [stop_word.strip() for stop_word in stop_words]
# 支援繁體中文較好的詞庫
jieba.set_dictionary("data/jieba_dict/dict.txt.big")
jieba.load_userdict("data/jieba_dict/中央機構.dict")
jieba.load_userdict("data/jieba_dict/名人錄.dict")
jieba.load_userdict("data/jieba_dict/專有名詞.dict")
jieba.load_userdict("data/jieba_dict/縣市區鄉鎮.dict")

for index, document in enumerate(content_list, 0):
    if index % 2000 == 0:
        print("current document index:{}".format(index))
    document = filter_not_chinese_word(document)
    preprocessed_document = tokenize_document(document, stop_word_list)
    preprocessed_documents.append(preprocessed_document)

Building prefix dict from /Users/Mark1002/Desktop/project/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /var/folders/dw/m2zgs87j3x19nl8mnfy3fs8c0000gn/T/jieba.ud2b054c4d13e51557150f7d36ba5f4d0.cache
Loading model cost 1.697 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000
current document index:6000
current document index:8000
current document index:10000
current document index:12000
current document index:14000
current document index:16000
current document index:18000
current document index:20000
current document index:22000
current document index:24000
current document index:26000
current document index:28000
current document index:30000
current document index:32000


In [152]:
import random

# 切分訓練與測試資料集
random.shuffle(preprocessed_documents)
train_corpus = preprocessed_documents[:int(len(preprocessed_documents)*0.8)]
test_corpus = preprocessed_documents[int(len(preprocessed_documents)*0.8):]

In [153]:
print("train set length: {}, test set length: {}".format(len(train_corpus), len(test_corpus)))

train set length: 27094, test set length: 6774


In [154]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_corpus)]

In [155]:
documents[:5]

[TaggedDocument(words=['年', '慶祝', '新世紀', '到來', '太陽', '王', '路易十四', '法國', '凡爾賽宮', '金碧輝煌', '大廳', '裡', '舉行', '盛大', '舞會', '時', '曾', '身著', '中國式', '服裝', '坐在', '一頂', '中國式', '八擡大轎', '裡', '出場', '全場', '發出', '一片', '驚歎', '聲'], tags=[0]),
 TaggedDocument(words=['年', '月', '日本', '國家隊', '主教練', '伊維卡', '奧西姆', '腦梗塞', '入院', '搶救', '岡田武史', '再次', '臨危受命', '接替', '奧西姆', '成為', '國家隊', '主教練'], tags=[1]),
 TaggedDocument(words=['朗奴', '卡達', '連拿馬田', '奧斯禾', '達', '路斯', '荷蘭', '足球', '運動員', '司職', '守門員'], tags=[2]),
 TaggedDocument(words=['擔任', '羅馬', '國王', '安', '庫斯', '馬', '基', '烏斯', '王子', '監護人', '身份', '國王', '死', '後', '奪取', '王位'], tags=[3]),
 TaggedDocument(words=['石', '碏', '死後由石', '駘', '仲石祁子', '父子', '相繼', '繼承', '但石', '駘', '仲', '石', '碏', '關係', '不詳', '僅知', '同族', '一說', '石', '碏', '從子', '一說', '石', '碏', '之孫'], tags=[4])]

In [156]:
# train doc2vec model
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [61]:
# 載入預訓練好的 doc2vec 模型
model = Doc2Vec.load("data/Doc2Vec_v1.4/d2v.model.bin")



In [185]:
# 分群，每一群代表一個 topic
from sklearn.cluster import KMeans
# 要定義分群數
topic_num = 10
vectors = model.docvecs.vectors_docs
k_mean = KMeans(n_clusters=topic_num).fit(vectors)
cluster_label = k_mean.labels_

In [186]:
# 分群統計
collections.Counter(cluster_label)

Counter({0: 1549,
         1: 8055,
         2: 1638,
         3: 1807,
         4: 1006,
         5: 1668,
         6: 3126,
         7: 1923,
         8: 3935,
         9: 2387})

In [241]:
# 建立測試向量
test_doc_vector = model.infer_vector(test_corpus[1322]).reshape(1, 50)
# 分群來去標示主題
cluster_result = int(k_mean.predict(test_doc_vector))

In [242]:
print("第 {} 群".format(cluster_result))

第 9 群


In [243]:
test_corpus[1322]

['外界',
 '辛頓',
 '印象',
 '一名',
 '早熟',
 '早殘',
 '球員',
 '蓋',
 '早',
 '年',
 '已',
 '當過',
 '荷蘭',
 '聯賽',
 '最佳',
 '球員',
 '並於',
 '年',
 '世界盃',
 '後',
 '轉投',
 '西甲',
 '超級',
 '球會',
 '巴塞羅那']

In [245]:
model.docvecs.most_similar(test_doc_vector)

  if np.issubdtype(vec.dtype, np.int):


[(4026, 0.6248799562454224),
 (11329, 0.5510990619659424),
 (21162, 0.5127044916152954),
 (21106, 0.5006772875785828),
 (25424, 0.4872778058052063),
 (25924, 0.48405373096466064),
 (6550, 0.47391277551651),
 (11439, 0.4514860510826111),
 (14484, 0.4501854479312897),
 (9417, 0.44867730140686035)]

In [248]:
train_corpus[4026]

['辛頓',
 '早',
 '年',
 '已',
 '當過',
 '荷蘭',
 '聯賽',
 '最佳',
 '球員',
 '參加',
 '過年',
 '世界盃',
 '足球賽',
 '年',
 '世界盃',
 '之後',
 '加盟',
 '西甲',
 '超級',
 '球會',
 '巴塞隆',
 '足球',
 '巴塞羅那']

In [250]:
cluster_label[4026]

9

In [251]:
# 取得該群的所有文章
cluster_docs = get_document_by_cluster(train_corpus, cluster_label, cluster_result)

In [265]:
# 統計主題，自動取得文章的 tags
count_doc_word_freq(cluster_docs)

[('年', 1095), ('足球', 957), ('運動員', 349), ('日本', 342), ('國家足球隊', 293)]

In [268]:
# 自動取得文章 tag
get_topic_tag(test_corpus[1322], train_corpus, model)

[('年', 1173), ('足球', 983), ('日本', 378), ('運動員', 360), ('國家足球隊', 325)]