In [21]:
import jieba
import re
import numpy as np
import pandas as pd
import collections
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from gensim.models import Word2Vec

In [2]:
def preprocess_raw_docs(raw_docs):
    """Clean, segment and stop-word-filter a collection of raw documents.

    Each document is reduced to its Chinese characters by
    ``filter_regx_word``, segmented with jieba, and stripped of stop words.

    Args:
        raw_docs: iterable of raw document strings.

    Returns:
        A list with one entry per input document (same order); each entry is
        the list of remaining tokens. A document with nothing left after
        filtering yields an empty list, so the result stays aligned with any
        parallel label array.
    """
    # Stop words. The encoding is stated explicitly (matching how the corpus
    # files are read) so the result does not depend on the platform default,
    # and a set gives constant-time membership tests in the loop below.
    with open("data/jieba_dict/stopwords.txt", encoding="utf-8") as stop_words:
        stop_word_set = {stop_word.strip() for stop_word in stop_words}
    # Dictionary with better Traditional Chinese coverage, plus user dictionaries.
    jieba.set_dictionary("data/jieba_dict/dict.txt.big")
    jieba.load_userdict("data/jieba_dict/中央機構.dict")
    jieba.load_userdict("data/jieba_dict/名人錄.dict")
    jieba.load_userdict("data/jieba_dict/專有名詞.dict")
    jieba.load_userdict("data/jieba_dict/縣市區鄉鎮.dict")

    # Holds the segmented result of every document.
    docs = []
    for index, doc in enumerate(raw_docs):
        if index % 2000 == 0:
            print("current document index:{}".format(index))
        # Keep only the Chinese characters. Fall back to an empty string if
        # the filter could not handle the input, so jieba never receives None.
        doc = filter_regx_word(doc) or ""
        # Segment, then drop the stop words.
        tokens = [token for token in jieba.cut(doc) if token not in stop_word_set]
        docs.append(tokens)
    return docs

In [3]:
def filter_regx_word(document):
    """Return ``document`` with every character outside U+4E00..U+9FA5 removed.

    Only characters in that range (Chinese ideographs) are kept, in their
    original order; letters, digits, punctuation and whitespace are dropped.

    Args:
        document: the text to filter.

    Returns:
        The filtered string. If ``document`` is not something the regex can
        scan (e.g. ``None``), the error message is printed and an empty string
        is returned, so callers always receive a ``str``.
    """
    try:
        return "".join(re.findall(r"[\u4e00-\u9fa5]", document))
    except TypeError as e:
        # re.findall raises TypeError for non-string input; report it and
        # degrade to "no text" instead of implicitly returning None.
        print("{}".format(str(e)))
        return ""

In [4]:
def sample_record_by_label(raw_df, num):
    """Randomly keep at most ``num`` rows for every distinct label.

    Args:
        raw_df: DataFrame with a ``label`` column. It is not modified.
        num: maximum number of rows to keep per label.

    Returns:
        A new DataFrame, in random row order, holding up to ``num`` randomly
        chosen rows of each label. Original index values are preserved.
    """
    # Shuffle first so that "the first num rows of a label" is a random sample.
    shuffled = raw_df.sample(frac=1)
    labels = shuffled.groupby("label").size().index.values
    per_label = [
        shuffled.loc[shuffled["label"] == label, :].iloc[:num]
        for label in labels
    ]
    if not per_label:
        # No labels at all: return an empty frame with the same columns.
        return shuffled.iloc[:0]
    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    # Shuffle again so the rows are not grouped by label.
    return pd.concat(per_label).sample(frac=1)

In [32]:
def construct_network_model(vocab_size, max_text_length, embedding_dim=100, num_classes=8):
    """Build an (uncompiled) embedding -> flatten -> softmax classifier.

    Args:
        vocab_size: size of the embedding's input vocabulary.
        max_text_length: length of every input sequence.
        embedding_dim: width of the learned word vectors. Defaults to 100.
        num_classes: number of output classes. Defaults to 8.

    Returns:
        A ``Sequential`` model whose output is a softmax over ``num_classes``.
        The caller is responsible for compiling it.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_text_length))
    # Concatenate the per-token vectors into one feature vector per document.
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [5]:
# Load the corpus of every topic. Each line of a topic file becomes one
# document, labelled with the topic's position in topic_list.
topic_list = ["政治", "科技", "娛樂", "體育", "社會", "財經", "健康", "國際"]
topic_frames = []

for index, topic in enumerate(topic_list):
    with open("data/text/big_data/corpus/" + topic + ".txt", "r", encoding="utf-8") as content:
        content_list = [line for line in content]
    temp_df = pd.DataFrame(content_list, columns=['content'])
    temp_df['label'] = index
    topic_frames.append(temp_df)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied
# the whole accumulated frame on every iteration. Then keep at most 20000
# randomly chosen documents per topic.
raw_df = sample_record_by_label(pd.concat(topic_frames), 20000)

In [6]:
# Separate the document texts from their integer topic labels.
X, y = raw_df["content"].values, raw_df["label"].values
# Hold out 10% of the documents for testing; the fixed seed keeps the split
# identical between runs on the same data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [7]:
# Work on the first 6000 training documents only. Labels are sliced the same
# way so they stay aligned with the documents.
labels = y_train[:6000]
# Segment each document and re-join its tokens with spaces, the form the
# Keras Tokenizer splits on.
docs = [" ".join(tokens) for tokens in preprocess_raw_docs(X_train[:6000])]

Building prefix dict from /home/mark/Documents/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /tmp/jieba.uf13363f31a3360411b43fe8e84af1634.cache
Loading model cost 1.414 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000


In [8]:
# Every document is padded (at the end) or truncated to this many tokens.
max_length = 200
# Build the word index from the training documents.
t = Tokenizer()
t.fit_on_texts(docs)
# One extra slot on top of the number of distinct words in the index.
vocab_size = 1 + len(t.word_index)
# Turn each document into a sequence of word indices, then bring all
# sequences to the same length.
encoded_docs = t.texts_to_sequences(docs)
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

In [None]:
# Loading the pretrained Word2Vec model is disabled: it needs too much memory.
# pretrain_model = Word2Vec.load("data/model/Word2Vec_v1.4/w2v.model.bin")

In [44]:
# Build the classifier for the fitted vocabulary and the padded length.
model = construct_network_model(vocab_size, max_length)
model.summary()
# Labels are integer class ids, hence the sparse variant of cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 100)          11616500  
_________________________________________________________________
flatten_5 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 160008    
Total params: 11,776,508
Trainable params: 11,776,508
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Train for 10 epochs, holding out 20% of the samples for validation.
model.fit(
    padded_docs,
    labels,
    epochs=10,
    validation_split=0.2,
)

Train on 4800 samples, validate on 1200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fabef1fd208>

In [94]:
# Run the first 10 test documents through the same pipeline as the training
# data: segment, re-join with spaces, index with the already-fitted tokenizer,
# and pad to the training length.
test_docs = [" ".join(tokens) for tokens in preprocess_raw_docs(X_test[:10])]
test_docs = pad_sequences(
    t.texts_to_sequences(test_docs),
    padding='post',
    maxlen=max_length,
)

Building prefix dict from /home/mark/Documents/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /tmp/jieba.uf13363f31a3360411b43fe8e84af1634.cache
Loading model cost 1.328 seconds.
Prefix dict has been built succesfully.


current document index:0


In [108]:
# Predicted class = index of the largest softmax probability per document.
# Sequential.predict_classes was removed from Keras; for a softmax output it
# was defined as exactly this argmax over model.predict.
np.argmax(model.predict(test_docs), axis=-1)



array([0, 5, 0, 6, 4, 0, 3, 3, 2, 5])

In [109]:
# Ground-truth labels of the same first 10 test documents, shown for
# comparison with the model's predicted classes.
y_test[:10]

array([0, 5, 0, 6, 4, 0, 3, 3, 4, 5])