引入依賴套件
----------

In [152]:
import json
import collections

import keras
import jieba

import numpy as np
import pandas as pd

In [153]:
# Load the annotated dataset from data.csv in the working directory.
data_df = pd.read_csv('./data.csv', encoding = 'utf-8')
data_df.head() # display the first five rows

Unnamed: 0,摘要,標註
0,中國福建省武平縣13日因連日強降雨，導致該縣平川街道東門市場旁的山坡發生山崩，多輛停在路旁的...,土石流
1,中國南方多省自6月上旬以來持續暴雨，各地紛紛出現洪水，統計至今已釀61死、14人失蹤、531...,水災
2,日本南部鹿兒島屋久島鎮，昨天一個下午降雨420毫米雨量，打破當地50年來最大單日降雨量，由於...,大雨
3,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨
4,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨


In [154]:
def segmentation(sentence: str) -> list:
    """
    Split a sentence into words with jieba.

    :param sentence: text to tokenize
    :return: the list returned by ``jieba.lcut(sentence)``
    """
    words = jieba.lcut(sentence)
    return words

將摘要進行中文斷詞
---------------------

In [155]:
# Tokenize every summary and store the token list in a new 'segmentation' column.
data_df['segmentation'] = data_df['摘要'].apply(segmentation)
data_df.head()

Unnamed: 0,摘要,標註,segmentation
0,中國福建省武平縣13日因連日強降雨，導致該縣平川街道東門市場旁的山坡發生山崩，多輛停在路旁的...,土石流,"[中國, 福建省, 武平, 縣, 13, 日因, 連日強, 降雨, ，, 導致, 該, 縣,..."
1,中國南方多省自6月上旬以來持續暴雨，各地紛紛出現洪水，統計至今已釀61死、14人失蹤、531...,水災,"[中國, 南方, 多省, 自, 6, 月, 上旬, 以來, 持續, 暴雨, ，, 各地, 紛..."
2,日本南部鹿兒島屋久島鎮，昨天一個下午降雨420毫米雨量，打破當地50年來最大單日降雨量，由於...,大雨,"[日本, 南部, 鹿兒島, 屋久, 島鎮, ，, 昨天, 一個, 下午, 降雨, 420, ..."
3,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨,"[日本, 沖, 繩縣, 與, 那國島, 距離, 宜蘭僅, 有, 111, 公里, ，, 今日..."
4,日本沖繩縣與那國島距離宜蘭僅有111公里，今日上午遭遇了一場暴雨，由於降雨量極大，日本氣象廳...,大雨,"[日本, 沖, 繩縣, 與, 那國島, 距離, 宜蘭僅, 有, 111, 公里, ，, 今日..."


將斷詞結果整合成陣列，並計算詞句的最長與最短長度
-----------------------------------

In [156]:
import sys

def get_all_vocab_from_data(data):
    """
        Flatten tokenized sentences into a single token list and measure
        the longest and shortest sentence (in tokens).

        :param data: iterable of token lists, one list per sentence
        :return: tuple ``(tokens, max_length, min_length)`` where ``tokens``
                 holds every token of every sentence in order (duplicates kept).
                 For empty input both lengths are 0, so no sentinel value
                 leaks out as a "minimum length".
    """
    train_vocab_list = []
    query_lengths = []
    # Single pass so that one-shot iterables (e.g. generators) also work.
    for cut_query in data:
        query_lengths.append(len(cut_query))
        train_vocab_list.extend(cut_query)
    max_cut_query_length = max(query_lengths, default=0)
    min_cut_query_length = min(query_lengths, default=0)
    return train_vocab_list, max_cut_query_length, min_cut_query_length

In [157]:
# Flatten all tokenized summaries and record the longest / shortest token counts.
train_vocab_list, max_cut_query_length, min_cut_query_length = get_all_vocab_from_data(data_df['segmentation'])

In [158]:
# Report the sentence-length range, then the token count with and without duplicates.
print('最長斷詞長度: {}'.format(max_cut_query_length))
print('最短斷詞長度: {}'.format(min_cut_query_length))
print('斷詞結果: {} 詞 (重複)'.format(len(train_vocab_list)))
print('斷詞結果: {} 詞 (不重複)'.format(len(set(train_vocab_list))))

最長斷詞長度: 178
最短斷詞長度: 16
斷詞結果: 4652 詞 (重複)
斷詞結果: 1178 詞 (不重複)


In [159]:
# Count how often each token occurs and show the five most frequent ones.
train_vocab_counter = collections.Counter(train_vocab_list)
train_vocab_counter.most_common(5)

[('，', 395), ('。', 118), ('（', 100), ('）', 100), ('地震', 86)]

In [160]:
# Count how many samples carry each label.
label_counter = collections.Counter(data_df['標註'])
label_counter.most_common() # no limit passed, so every label is listed, most frequent first

[('地震', 42), ('土石流', 4), ('大雨', 3), ('水災', 2)]

統計只出現一次的詞（低使用率）
-------------

In [161]:
def statistic_zero_usage_word(counter: collections.Counter) -> int:
    """
        Count the distinct words that occur at most once in ``counter``.

        :param counter: word -> occurrence-count mapping to inspect
        :return: number of keys whose count is <= 1
    """
    # Read from the argument itself; the previous version overwrote the
    # parameter with 0 and iterated over a notebook-level global instead.
    return sum(1 for times in counter.values() if times <= 1)
# Number of distinct words that appear only once, and their share of the vocabulary.
train_data_times_zero = statistic_zero_usage_word(train_vocab_counter)
print('只使用到一次的詞: {}種'.format(train_data_times_zero))
# NOTE(review): this divides by the vocabulary size, so it raises ZeroDivisionError when the counter is empty.
print('佔全部 {} %'.format(train_data_times_zero/len(train_vocab_counter)*100))

只使用到一次的詞: 646種
佔全部 54.83870967741935 %


讀取和寫入JSON file的函數
-----------

In [162]:
def read_from_json(path: str) -> dict:
    """
        Parse a UTF-8 encoded JSON file and return its content.

        :param path: file path, e.g. ./data.json
        :return: the decoded JSON document
    """
    with open(path, encoding='utf-8') as json_file:
        content = json.load(json_file)
    return content
    
def write_to_json(path: str, data: dict):
    """
        Serialize ``data`` as JSON into a UTF-8 encoded file.

        Non-ASCII characters are written as-is rather than as \\uXXXX escapes.

        :param path: target file path, e.g. ./data.json (overwritten if present)
        :param data: dictionary to store, {key: value}
    """
    with open(path, mode='w', encoding='utf-8') as json_file:
        json.dump(data, fp=json_file, ensure_ascii=False)

轉換Counter成字典並儲存成data_count.json
--------

In [163]:
# Convert the Counter to a plain dict and persist the word frequencies to data_count.json.
write_to_json('./data_count.json', dict(train_vocab_counter))

補充：讀取data_count.json後再轉換回Counter
------

In [164]:
# Load the saved frequencies back and wrap them in a Counter again.
data = read_from_json('./data_count.json')
counter = collections.Counter(data)
counter.most_common(5)

[('，', 395), ('。', 118), ('（', 100), ('）', 100), ('地震', 86)]

建立分詞索引表並儲存到word_index.json
--------

In [165]:
def counter_to_key_index_table(counter: collections.Counter, start_index: int = 0) -> dict:
    """
        Map every key of ``counter`` to a consecutive integer index.

        Keys are numbered in ``counter.most_common()`` order, so the most
        frequent key gets ``start_index`` and rarer keys get larger indices.

        :param counter: Counter whose keys should be indexed
        :param start_index: index assigned to the most frequent key.
                            Defaults to 0 (the original behaviour); pass 1 to
                            keep 0 free, e.g. for a padding / unknown id.
        :return: dict {key: index}
    """
    return {
        key: index
        for index, (key, _count) in enumerate(counter.most_common(), start=start_index)
    }
# Build the word -> index table from the vocabulary counter.
word_to_index = counter_to_key_index_table(train_vocab_counter)
write_to_json('word_index.json', word_to_index) # persist the table to word_index.json
print('分詞索引表長度: {}'.format(len(word_to_index)))
visualize_df = pd.DataFrame.from_dict(word_to_index.items())
visualize_df.head() # preview the first five (word, index) rows

分詞索引表長度: 1178


Unnamed: 0,0,1
0,，,0
1,。,1
2,（,2
3,）,3
4,地震,4


建立標籤索引表並儲存到label_index.json
--------

In [166]:
# Build the label -> index table the same way and persist it to label_index.json.
label_to_index = counter_to_key_index_table(label_counter)
write_to_json('label_index.json', label_to_index)
print('標籤索引表長度 {}'.format(len(label_to_index)))

標籤索引表長度 4


補充：將word_index.json讀入後翻轉成index: word的形式
----------

In [167]:
# Reload the word -> index table from disk and invert it into index -> word.
# ref: https://stackoverflow.com/questions/483666/python-reverse-invert-a-mapping
word_to_index = read_from_json('./word_index.json')
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
visualize_df = pd.DataFrame.from_dict(index_to_word.items())
visualize_df.head()  # preview the first five (index, word) rows

Unnamed: 0,0,1
0,0,，
1,1,。
2,2,（
3,3,）
4,4,地震


將詞句轉向量
---

In [168]:
def sentence_vetorize(sentences: [str], labels: [str], word_to_index: dict, label_to_index: dict, max_length: int) -> (np.ndarray, np.ndarray):
    """
        Encode tokenized sentences and their labels as integer arrays.

        :param sentences: iterable of token lists (already segmented)
        :param labels: label for each sentence, in the same order
        :param word_to_index: token -> integer id table; unknown tokens become 0
        :param label_to_index: label -> integer id table; unknown labels become 0
        :param max_length: width of every encoded sentence. ``pad_sequences``
                           is called with its defaults, which per the Keras
                           documentation pad and truncate at the front: shorter
                           sentences get leading zeros, longer ones keep only
                           their last ``max_length`` tokens.
        :return: tuple (x, y) of the padded sentence ids and a one-column
                 array of label ids
    """
    encoded_sentences = []
    encoded_labels = []
    for tokens, tag in zip(sentences, labels):
        encoded_sentences.append([word_to_index.get(token, 0) for token in tokens])
        # Wrap the label id in a list so pad_sequences yields one column per sample.
        encoded_labels.append([label_to_index.get(tag, 0)])
    pad = keras.preprocessing.sequence.pad_sequences
    padded_sentences = pad(encoded_sentences, maxlen=max_length)
    padded_labels = pad(encoded_labels, maxlen=1)
    return (padded_sentences, padded_labels)
    

In [169]:
# Encode every sentence to a fixed width of 100 token ids and every label to its class id.
x_train, y_train = sentence_vetorize(data_df['segmentation'], data_df['標註'], word_to_index, label_to_index, 100)
# Only the last expression of a cell is displayed, so a bare tuple in the
# middle of the cell is silently discarded; print the shapes explicitly.
print(x_train.shape, y_train.shape)
visualize_df = pd.DataFrame(x_train)
visualize_df.head()  # preview the first five encoded sentences

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,555,5,232,556,115,117,176,82,342,1
1,147,562,119,0,563,179,564,565,344,11,...,600,354,2,83,238,601,602,354,3,1
2,0,0,0,0,0,0,0,0,0,43,...,241,31,119,242,101,146,0,633,634,1
3,0,0,0,0,0,0,0,0,0,0,...,376,377,245,378,379,181,5,380,239,1
4,0,0,0,0,0,0,0,0,0,0,...,376,377,245,378,379,181,5,380,239,1


將標籤使用one-hot-encoding編碼
----

In [170]:
# One-hot encode the label ids; the number of classes is the size of the label table.
# NOTE(review): y_train is reassigned from itself, so re-running this cell encodes the already-encoded array again.
y_train = keras.utils.to_categorical(y_train, len(label_to_index))
print('shape {}'.format(y_train.shape))
visualize_df = pd.DataFrame(y_train)
visualize_df.head() # preview the first five one-hot rows

shape (51, 4)


Unnamed: 0,0,1,2,3
0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


存成.npz檔 （下次就直接load成numpy陣列不必再處理）
------

In [173]:
# Store both arrays in a single archive under the keys 'x_train' and 'y_train'
# (the load cell reads it back as ./training_data.npz).
np.savez('training_data', x_train=x_train, y_train=y_train)

載入.npz檔
---------

In [177]:
# Read both arrays back from the archive; the with-block closes the file afterwards.
with np.load('./training_data.npz') as f:
    x_train, y_train = f['x_train'], f['y_train']

In [178]:
# Check that the reloaded sentence matrix looks as expected.
visualize_df = pd.DataFrame(x_train)
visualize_df.head() # preview the first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,555,5,232,556,115,117,176,82,342,1
1,147,562,119,0,563,179,564,565,344,11,...,600,354,2,83,238,601,602,354,3,1
2,0,0,0,0,0,0,0,0,0,43,...,241,31,119,242,101,146,0,633,634,1
3,0,0,0,0,0,0,0,0,0,0,...,376,377,245,378,379,181,5,380,239,1
4,0,0,0,0,0,0,0,0,0,0,...,376,377,245,378,379,181,5,380,239,1


In [179]:
# Check that the reloaded one-hot label matrix looks as expected.
visualize_df = pd.DataFrame(y_train)
visualize_df.head() # preview the first five rows

Unnamed: 0,0,1,2,3
0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
