In [1]:
import numpy as np
import pandas as pd
import jieba
import re
import collections
import matplotlib.pyplot as plt
from pylab import rcParams
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Customizing plots with style 
rcParams['figure.figsize'] = 10, 5
rcParams['lines.linewidth'] = 2
plt.style.use('ggplot')

In [3]:
def filter_outlier_by_article(df, std_num):
    df = df.dropna()
    std = df["POST_CONTENT"].str.len().std()
    mean = df["POST_CONTENT"].str.len().mean()
    upper = mean + std_num*std
    return df.loc[df["POST_CONTENT"].str.len()<upper,:]

In [4]:
def filter_not_chinese_word(document):
# 只取中文
    try:
        document = "".join(re.findall(r"[\u4e00-\u9fa5]+", document))
        return document
    except Exception as e:
        print("{}".format(str(e)))

In [5]:
def tokenize_document(doc, stop_word_list):
    preprocessed_document = jieba.cut(doc)
    # 去除保留字
    preprocessed_document = list(filter(lambda x: x not in stop_word_list, preprocessed_document))
    return preprocessed_document

In [6]:
#  統計文檔關鍵字
def count_doc_word_freq(docs):
    word_list = []
    for doc in docs:
        for word in doc:
            word_list.append(word)
    counter = collections.Counter(word_list)
    return counter.most_common()[:5]

In [7]:
def get_topic_tag(docs, d2v_model, classifier, topic_list):
    # 確保單一記錄維度預測格式正確
    if np.ndim(docs) == 1:
        docs = [docs]
    # 預測測試資料準確度，使用 infer vector
    doc_vector = np.array([d2v_model.infer_vector(doc) for doc in docs])
    # 預測測試文章分類
    predicted_result = [dict(zip(topic_list, pred)) for pred in np.round(classifier.predict_proba(doc_vector),3)]
    return predicted_result

In [8]:
def get_topic_tag_tf_idf(docs, tf_idf, classifier, topic_list):
    # 確保單一記錄維度預測格式正確
    if np.ndim(docs) == 1:
        docs = [docs]
    docs_tf_idf = convert_tf_idf_corpus(docs)
    tf_idf_vector = tf_idf.transform(docs_tf_idf)
    # 預測測試文章分類
    predicted_result = [dict(zip(topic_list, pred)) for pred in np.round(classifier.predict_proba(tf_idf_vector),3)]
    return predicted_result

In [9]:
def preprocess_text_data(raw_docs):
    # 用來存放分詞後的結果
    preprocessed_documents = []
    # stopword
    with open("data/jieba_dict/stopwords.txt") as stop_words:
        stop_word_list = [stop_word.strip() for stop_word in stop_words]
    # 支援繁體中文較好的詞庫
    jieba.set_dictionary("data/jieba_dict/dict.txt.big")
    jieba.load_userdict("data/jieba_dict/中央機構.dict")
    jieba.load_userdict("data/jieba_dict/名人錄.dict")
    jieba.load_userdict("data/jieba_dict/專有名詞.dict")
    jieba.load_userdict("data/jieba_dict/縣市區鄉鎮.dict")

    for index, document in enumerate(raw_docs, 0):
        if index % 2000 == 0:
            print("current document index:{}".format(index))
        # 去除非中文字    
        document = filter_not_chinese_word(document)
        # 分詞與去掉保留字
        document = tokenize_document(document, stop_word_list)
        preprocessed_documents.append(document)
    return preprocessed_documents

In [10]:
def sample_record_by_label(raw_df, num):
    temp_df = pd.DataFrame()
    raw_df = shuffle(raw_df)
    labels = raw_df.groupby('label').size().index.values
    for label in labels:
        temp_df = temp_df.append(raw_df.loc[raw_df["label"]==label,:].iloc[:num])  
    return shuffle(temp_df)

In [11]:
def convert_tf_idf_corpus(corpus):
    return [" ".join(doc) for doc in corpus]

In [13]:
# 載入不同主題資料
topic_list = ["政治", "科技", "娛樂", "體育", "社會", "財經", "健康", "國際"]
raw_df = pd.DataFrame()

for index, topic in enumerate(topic_list, 0):
    with open("data/text/big_data/corpus/" + topic + ".txt", "r", encoding="utf-8") as content:
        content_list = [line.strip().replace(' ', '') for line in content]
    temp_df = pd.DataFrame(content_list, columns=['content'])
    temp_df['label'] = index
    raw_df = raw_df.append(temp_df)

In [14]:
raw_df = sample_record_by_label(raw_df, 20000)

In [15]:
X = raw_df['content'].values
y = raw_df['label'].values
# 切分訓練與測試
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [16]:
# 統計各類次數
print(collections.Counter(y_train))
print(collections.Counter(y_test))

Counter({0: 18025, 2: 18022, 6: 18021, 4: 18014, 1: 18013, 7: 18003, 5: 17989, 3: 17913})
Counter({3: 2087, 5: 2011, 7: 1997, 1: 1987, 4: 1986, 6: 1979, 2: 1978, 0: 1975})


In [16]:
X_train_preprocessed = preprocess_text_data(X_train)

Building prefix dict from /Users/Mark1002/Desktop/project/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Dumping model to file cache /var/folders/dw/m2zgs87j3x19nl8mnfy3fs8c0000gn/T/jieba.ud2b054c4d13e51557150f7d36ba5f4d0.cache
Loading model cost 3.269 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000
current document index:6000
current document index:8000
current document index:10000
current document index:12000
current document index:14000
current document index:16000
current document index:18000
current document index:20000
current document index:22000
current document index:24000
current document index:26000
current document index:28000
current document index:30000
current document index:32000
current document index:34000
current document index:36000
current document index:38000
current document index:40000
current document index:42000
current document index:44000
current document index:46000
current document index:48000
current document index:50000
current document index:52000
current document index:54000
current document index:56000
current document index:58000
current document index:60000
current document index:62000
current document index:64000
current document index:66000
current document index

In [17]:
X_test_preprocessed = preprocess_text_data(X_test)

Building prefix dict from /Users/Mark1002/Desktop/project/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /var/folders/dw/m2zgs87j3x19nl8mnfy3fs8c0000gn/T/jieba.ud2b054c4d13e51557150f7d36ba5f4d0.cache
Loading model cost 1.637 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000
current document index:6000
current document index:8000
current document index:10000
current document index:12000
current document index:14000


In [18]:
print("train set length: {}, test set length: {}".format(len(X_train_preprocessed), len(X_test_preprocessed)))

NameError: name 'X_train_preprocessed' is not defined

In [19]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train_preprocessed)]

In [20]:
len(documents)

144000

In [53]:
# train doc2vec model
model = Doc2Vec(vector_size=100, window=10, min_count=5, workers=2, epochs=20)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
model.save("data/model/auto_tag/d2v.model")

In [21]:
d2v_model = Doc2Vec.load("data/model/auto_tag/d2v.model")

In [63]:
# doc2vec 文章分類
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.externals import joblib

# classifier = RandomForestClassifier()
classifier = linear_model.LogisticRegression()
classifier.fit(d2v_model.docvecs.vectors_docs, y_train)
joblib.dump(classifier, 'data/model/auto_tag/classifier.pkl')

['data/model/auto_tag/classifier.pkl']

In [22]:
from sklearn.externals import joblib
# load classifier
classifier = joblib.load('data/model/auto_tag/classifier.pkl')

In [23]:
from sklearn.metrics import accuracy_score
# 訓練資料準確度
train_predict = classifier.predict(d2v_model.docvecs.vectors_docs)
accuracy_score(y_train, train_predict)

0.12502083333333333

In [29]:
import time
start_time = time.time()
# 預測測試資料準確度，使用 infer vector
test_X_vecter = np.array([d2v_model.infer_vector(doc) for doc in X_test_preprocessed]) 
# 預測測試文章分類
test_predict = classifier.predict(test_X_vecter)
print("acc: {}, spend time: {}".format(accuracy_score(y_test, test_predict), time.time() - start_time))

acc: 0.674125, spend time: -198.08531403541565


In [31]:
np.round(classifier.predict_proba(d2v_model.docvecs.vectors_docs[:1]), 3)

array([[0.002, 0.036, 0.001, 0.007, 0.001, 0.786, 0.04 , 0.129]])

In [172]:
# 測試資料
(topic_list[y_test[22]], X_test[22])

('健康',
 '好醫師新聞網記者邊建元／台東報導圖：台東馬偕直腸外科洪毓廷醫師俗語說「十男九痔，有痔瘡的女人也不少」，可見痔瘡是個非常普遍的惱人隱疾。一位收容人，自行以釣魚線綁在痔瘡上，因疼痛難耐被送來就醫，檢查時痔瘡已嚴重腫脹壞死，已無法以藥物治療，必須開刀切除才痊癒。台東馬偕直腸外科洪毓廷呼籲，民眾若有肛門相關問題，包括會痛、會流血或滲液、會癢，都應尋找專科醫師檢查治療，不應自行尋求偏方以免延誤治療。雖然痔瘡與癌症無關，但臨床上常因懷疑是痔瘡卻查出更多複雜的疾病，甚至是癌症，不可不慎。台東馬偕的毓廷是台東唯一的直腸外科醫師。他說，痔瘡的功能就好像是水龍頭裡的橡皮墊，控制直腸內的氣體和液體，若平日排便習慣不佳、肛門血管循環變差，易造成痔瘡問題。在門診的個案中，有一半都是來看痔瘡疾病，病人都是因為患部持續流血、疼痛或造成身體不適才會就醫。這種疾病不分族群不分年齡，但女性因經歷懷孕生產，所以罹患比率高於男性；職業上則以粗重工作及長期坐辦公室居多，可能伴隨人一輩子，許多人則與它和平共處。洪毓廷指出，痔瘡一般分為「內痔」、「外痔」、「混合痔」，通常會痛的都是外痔發作。痔瘡也依脫出的程度可分為四度，一般來說大部分的痔瘡採取局部藥物塗抹治療即能有效控制症狀，若有以下三個狀況該考慮開刀，一是頻繁無法控制的出血，二是反覆的發作或是嚴重的疼痛無法被改善，再來是持續的脫出或是異物感，治療的選擇建議與專科醫師做詳細的討論。外科方法的治療包括橡皮筋結紮術，電燒，雷射或冷凍治療，超音波導引肛門血管結紮術，環狀切除術，微創痔瘡切除術及傳統的內外至全切除手術，一般而言痔瘡切除後並不會造成肛門失禁，手術後一週間常會因疼痛或感覺異常造成不同程度的排便困難或是輕微水便失禁等情形，而老人家肛門肌肉較鬆弛容易在手術後有肛門關不住的感覺，至於手術方法的選擇應諮詢專業醫師的建議。洪醫師也進一步提出痔瘡的預防及保健，電視上廣告中所提的「不要坐太久、天天有蔬果、少辣少油炸、飯後走一走」，雖然洗腦卻是要落實在生活中。另外，現今３C產品充斥生活，很多人常帶手機進廁所，一待待半小時，過於擠壓肛門會使痔瘡惡化，建議上廁所時間不要超過十分鐘。痔瘡患者常常會有解不乾淨的感覺，這是因為蹲坐時間太長導致痔瘡腫脹，感覺上會以為是還沒解乾淨，如果再持續用力就會把痔瘡擠出肛門，造成急性血栓性痔瘡。如果有這種情形，建議離開廁所走一走

In [19]:
# 使用 tf-idf 來分類
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# 轉換 tf-idf 格式
# X_train_tf_idf = convert_tf_idf_corpus(X_train_preprocessed)
X_test_tf_idf = convert_tf_idf_corpus(X_test_preprocessed)

In [121]:
# 定義 tf-idf 模型
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2)
tfidf = vectorizer.fit(X_train_tf_idf)

In [179]:
joblib.dump(tfidf, 'data/model/auto_tag/tf-idf.pkl')

['data/model/auto_tag/tf-idf.pkl']

In [22]:
from sklearn.externals import joblib
tfidf = joblib.load('data/model/auto_tag/tf-idf.pkl')

In [26]:
# 轉換 tf-idf 特徵
# X_train_tf_idf_feature = tfidf.transform(X_train_tf_idf)
X_test_tf_idf_feature = tfidf.transform(X_test_tf_idf)

In [49]:
X_test_tf_idf_feature.shape

(16000, 326567)

In [48]:
len(tfidf.vocabulary_)

326567

In [182]:
from sklearn import linear_model

In [183]:
classifier = linear_model.LogisticRegression()
classifier.fit(X_train_tf_idf_feature, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [186]:
joblib.dump(classifier, 'data/model/auto_tag/tf-idf-classifier.pkl')

['data/model/auto_tag/tf-idf-classifier.pkl']

In [36]:
# load classifier
tf_idf_classifier = joblib.load('data/model/auto_tag/tf-idf-classifier.pkl')

In [37]:
from sklearn.metrics import accuracy_score
# 訓練資料準確度
train_predict = tf_idf_classifier.predict(X_train_tf_idf_feature)
accuracy_score(y_train, train_predict)

0.8942222222222223

In [38]:
import time
start_time = time.time()

test_predict = tf_idf_classifier.predict(X_test_tf_idf_feature)
print("acc: {}, spend time: {}".format(accuracy_score(y_test, test_predict), time.time() - start_time))

acc: 0.8885, spend time: 0.07359504699707031


In [174]:
tfidf.transform(X_test_preprocessed[22])

<407x326567 sparse matrix of type '<class 'numpy.float64'>'
	with 363 stored elements in Compressed Sparse Row format>

In [188]:
X_test_preprocessed[22]

['好',
 '醫師',
 '新聞網',
 '記者',
 '建元',
 '台東',
 '報導',
 '圖台',
 '東馬',
 '偕',
 '直腸',
 '外科',
 '洪毓廷',
 '醫師',
 '俗語說',
 '十男',
 '痔',
 '痔瘡',
 '女人',
 '痔瘡',
 '普遍',
 '惱人',
 '隱疾',
 '一位',
 '收容',
 '人',
 '自行',
 '釣魚',
 '線',
 '綁',
 '痔瘡',
 '疼痛',
 '難耐',
 '送來',
 '就醫',
 '檢查',
 '時',
 '痔瘡',
 '已',
 '嚴重',
 '腫脹',
 '壞死',
 '已',
 '無法',
 '藥物',
 '治療',
 '開刀',
 '切除',
 '痊癒',
 '台',
 '東馬',
 '偕',
 '直腸',
 '外科',
 '洪毓廷',
 '呼籲',
 '民眾',
 '肛門',
 '相關',
 '問題',
 '包括',
 '痛會',
 '流血',
 '滲液',
 '癢',
 '應',
 '尋找',
 '專科',
 '醫師',
 '檢查',
 '治療',
 '應',
 '自行',
 '尋求',
 '偏方',
 '延誤',
 '治療',
 '痔瘡',
 '癌症',
 '無關',
 '但臨',
 '床上',
 '常因',
 '懷疑',
 '痔瘡',
 '卻',
 '查出',
 '複雜',
 '疾病',
 '癌症',
 '不可',
 '不慎',
 '台',
 '東馬',
 '偕',
 '毓',
 '廷',
 '台東',
 '唯一',
 '直腸',
 '外科',
 '醫師',
 '說',
 '痔瘡',
 '功能',
 '好像',
 '水龍頭',
 '裡的',
 '橡皮',
 '墊',
 '控制',
 '直腸',
 '氣體',
 '液體',
 '平日',
 '排便',
 '習慣',
 '不佳',
 '肛門',
 '血管',
 '循環',
 '變差',
 '易',
 '造成',
 '痔瘡',
 '問題',
 '門診',
 '個案',
 '中有',
 '一半',
 '痔瘡',
 '疾病',
 '病人',
 '患部',
 '持續',
 '流血',
 '疼痛',
 '造成',
 '身體',
 '適才',
 '就醫',
 '這種',
 '疾病',
 '分',
 '族群',
 '分

In [189]:
get_topic_tag_tf_idf(X_test_preprocessed[22], tfidf, classifier, topic_list)

[{'健康': 0.946,
  '國際': 0.013,
  '娛樂': 0.004,
  '政治': 0.005,
  '社會': 0.02,
  '科技': 0.004,
  '財經': 0.005,
  '體育': 0.003}]