引入依賴套件
----------

In [9]:
import json
import collections

import keras
import jieba

import numpy as np
import pandas as pd

Using TensorFlow backend.


In [10]:
# Load the annotated summaries; the CSV is read as UTF-8.
data_df = pd.read_csv('./data.csv', encoding='utf-8')
data_df.head(n=5)  # preview the first five rows

Unnamed: 0,摘要,標註
0,中國福建省武平縣13日因連日強降雨，導致該縣平川街道東門市場旁的山坡發生山崩，多輛停在路旁的...,土石流
1,中國南方多省自6月上旬以來持續暴雨，各地紛紛出現洪水，統計至今已釀61死、14人失蹤、531...,水災
2,日本南部鹿兒島屋久島鎮，昨天一個下午降雨420毫米雨量，打破當地50年來最大單日降雨量，由於...,大雨
3,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨
4,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨


In [14]:
def segmentation(sentence: str) -> list:
    """Segment ``sentence`` with jieba and return the tokens as a list.

    :param sentence: text to segment
    :return: list of the tokens produced by ``jieba.cut``
    """
    tokens = list(jieba.cut(sentence))
    return tokens

def jieba_tokenizer(sentence: str) -> str:
    """Segment ``sentence`` with jieba and join the tokens with single spaces.

    :param sentence: text to segment
    :return: the tokens of ``sentence`` as one space-separated string
    """
    tokens = jieba.cut(sentence)
    separator = ' '
    return separator.join(tokens)

將摘要進行中文斷詞
---------------------

In [15]:
# Add a 'tokenized' column: each summary rewritten as space-separated jieba tokens.
# NOTE(review): the recorded output of this cell also shows a 'segmentation'
# column that no visible cell creates - presumably left over from an earlier
# kernel session; confirm with Restart Kernel -> Run All.
data_df['tokenized'] = data_df['摘要'].map(jieba_tokenizer)
data_df.head(n=5)

Unnamed: 0,摘要,標註,segmentation,tokenized
0,中國福建省武平縣13日因連日強降雨，導致該縣平川街道東門市場旁的山坡發生山崩，多輛停在路旁的...,土石流,中國福建省武平縣13日因連日強降雨，導致該縣平川街道東門市場旁的山坡發生山崩，多輛停在路旁的...,中國 福建省 武平 縣 13 日因 連日強 降雨 ， 導致 該 縣 平川 街道 東門 市場 ...
1,中國南方多省自6月上旬以來持續暴雨，各地紛紛出現洪水，統計至今已釀61死、14人失蹤、531...,水災,中國南方多省自6月上旬以來持續暴雨，各地紛紛出現洪水，統計至今已釀61死、14人失蹤、531...,中國 南方 多省 自 6 月 上旬 以來 持續 暴雨 ， 各地 紛紛 出現 洪水 ， 統計 ...
2,日本南部鹿兒島屋久島鎮，昨天一個下午降雨420毫米雨量，打破當地50年來最大單日降雨量，由於...,大雨,日本南部鹿兒島屋久島鎮，昨天一個下午降雨420毫米雨量，打破當地50年來最大單日降雨量，由於...,日本 南部 鹿兒島 屋久 島鎮 ， 昨天 一個 下午 降雨 420 毫米 雨量 ， 打破 當...
3,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,日本 沖 繩縣 與 那國島 距離 宜蘭僅 有 111 公里 ， 今日 上午 遭遇 了 一場 ...
4,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,日本 沖 繩縣 與 那國島 距離 宜蘭僅 有 111 公里 ， 今日 上午 遭遇 了 一場 ...


讀取和寫入JSON file的函數
-----------

In [20]:
def read_from_json(path: str) -> dict:
    """
        Load a .json file and return its parsed content.
        :param path: file path, e.g. ./data.json
        :return: the object decoded from the file (read as UTF-8)
    """
    with open(path, mode='r', encoding='utf-8') as json_file:
        content = json.load(json_file)
    return content
    
def write_to_json(path: str, data: dict) -> None:
    """
        Serialize a dictionary to a .json file.
        :param path: file path, e.g. ./data.json
        :param data: data dictionary {key: value}

        The file is written as UTF-8 and non-ASCII characters are kept
        as-is rather than escaped (ensure_ascii=False).
    """
    with open(path, mode='w', encoding='utf-8') as json_file:
        json.dump(obj=data, fp=json_file, ensure_ascii=False)

透過Keras Tokenizer進行資料預處理
-----------------------------------------

In [65]:
# Passed to the tokenizer as ``num_words``.
max_number_of_words = 10000
# Characters removed from the text before it is split on spaces: ASCII
# punctuation, tab and newline, full-width punctuation, and 你/我/他/的.
# The backslash is written as ``\\`` because ``\]`` is not a valid escape
# sequence in a normal string literal (it triggers an invalid-escape warning);
# the resulting string is unchanged.
# NOTE(review): Keras applies ``filters`` character by character, so 你/我/他/的
# are also stripped from inside longer tokens (e.g. 他們 -> 們) - confirm this
# is the intended stop-word behaviour.
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n，。！：（＿）你我他的'
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=max_number_of_words,
    filters=filters,
    lower=True,
    split=" ",
    char_level=False,
)
# Build the vocabulary from the space-separated jieba tokens.
tokenizer.fit_on_texts(data_df['tokenized'])

將文字斷詞統計表儲存成data_count.json
--------

In [66]:
# Persist the fitted tokenizer's word counts (tokenizer.word_counts) as JSON.
write_to_json('./data_count.json', tokenizer.word_counts)

補充：讀取data_count.json後再轉換回Counter
------

In [67]:
# Reload the saved counts and rebuild a Counter from the plain dict.
data = read_from_json('./data_count.json')
counter = collections.Counter(data)
counter.most_common(5)  # the five most frequent tokens

[('地震', 86), ('規模', 76), ('發生', 69), ('公里', 66), ('5', 53)]

建立分詞索引表並儲存到word_index.json
--------

In [68]:
# Persist the word -> index table built by the tokenizer.
write_to_json('./word_index.json', tokenizer.word_index)

# Read it back from disk and inspect it.
word_to_index = read_from_json('./word_index.json')
print('分詞索引表長度: {}'.format(len(word_to_index)))
# DataFrame.from_dict is documented for dict input; build the two-column
# preview from an explicit list of (word, index) pairs instead of passing it
# a dict items view.
visualize_df = pd.DataFrame(list(word_to_index.items()))
visualize_df.head()  # preview the first rows

分詞索引表長度: 1133


Unnamed: 0,0,1
0,地震,1
1,規模,2
2,發生,3
3,公里,4
4,5,5


建立標籤索引表並儲存到label_index.json
--------

In [91]:
# A second tokenizer, fitted only on the label column, yields the
# label -> index table.
# NOTE(review): it reuses the text ``filters``, so any label containing one of
# those characters would be altered - confirm no label does.
label_tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters=filters,
    lower=True,
    split=" ",
    char_level=False,
)
label_tokenizer.fit_on_texts(data_df['標註'])
label_to_index = label_tokenizer.word_index
write_to_json('label_index.json', label_to_index)
print('標籤索引表長度 {}'.format(len(label_to_index)))

標籤索引表長度 4


補充：將word_index.json讀入後翻轉成index: word的形式
----------

In [70]:
word_to_index = read_from_json('./word_index.json')
# Invert the mapping: word -> index becomes index -> word.
# ref: https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping
index_to_word = {index: word for word, index in word_to_index.items()}
# DataFrame.from_dict is documented for dict input; build the preview from an
# explicit list of (index, word) pairs instead of passing it a dict items view.
visualize_df = pd.DataFrame(list(index_to_word.items()))
visualize_df.head()  # preview the first rows

Unnamed: 0,0,1
0,1,地震
1,2,規模
2,3,發生
3,4,公里
4,5,5


將詞句轉向量
---

In [100]:
max_length_of_sequences = 100
# Convert every tokenized summary into its list of word indices.
x_sequences = tokenizer.texts_to_sequences(data_df['tokenized'])
# Bring every sequence to exactly max_length_of_sequences entries: shorter ones
# are padded with 0 at the front, longer ones are cut at the front
# (both are the Keras defaults, spelled out here).
x_train = keras.preprocessing.sequence.pad_sequences(
    x_sequences,
    maxlen=max_length_of_sequences,
    padding='pre',
    truncating='pre',
)

In [101]:
# Wrap the padded index matrix in a DataFrame to preview it.
visualize_df = pd.DataFrame(data=x_train)
visualize_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,534,535,536,219,537,111,113,166,81,323
1,6,167,540,324,541,220,542,139,543,115,...,579,580,5,335,82,225,581,582,33,335
2,0,0,0,0,0,0,0,0,0,0,...,72,340,228,34,115,229,97,138,613,614
3,0,0,0,0,0,0,0,0,0,0,...,356,6,357,358,232,359,360,170,361,226
4,0,0,0,0,0,0,0,0,0,0,...,356,6,357,358,232,359,360,170,361,226


將標籤使用one-hot-encoding編碼
----

In [103]:
# 透過tokenizer的話，會預設留0做特殊用途
label_matrix = label_tokenizer.texts_to_matrix(data_df['標註'])
print('shape {}'.format(label_matrix.shape))
visualize_df = pd.DataFrame(label_matrix)
visualize_df.head() #視覺化查看資料

shape (51, 5)


Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


存成.npz檔 （下次就直接load成numpy陣列不必再處理）
------

In [104]:
# Store both arrays in one archive under the keys 'x_train' and 'y_train'.
# np.savez appends the '.npz' suffix; the next cell loads './training_data.npz'.
np.savez('training_data', x_train=x_train, y_train=label_matrix)

載入.npz檔
---------

In [105]:
# Read both arrays back from the archive; the file is closed when the block ends.
with np.load('./training_data.npz') as f:
    x_train = f['x_train']
    y_train = f['y_train']

In [106]:
# Preview the reloaded feature matrix.
visualize_df = pd.DataFrame(data=x_train)
visualize_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,534,535,536,219,537,111,113,166,81,323
1,6,167,540,324,541,220,542,139,543,115,...,579,580,5,335,82,225,581,582,33,335
2,0,0,0,0,0,0,0,0,0,0,...,72,340,228,34,115,229,97,138,613,614
3,0,0,0,0,0,0,0,0,0,0,...,356,6,357,358,232,359,360,170,361,226
4,0,0,0,0,0,0,0,0,0,0,...,356,6,357,358,232,359,360,170,361,226


In [107]:
# Preview the reloaded label matrix.
visualize_df = pd.DataFrame(data=y_train)
visualize_df.head(n=5)

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
