In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RNN を使ったテキスト分類

In [2]:
# !pip install -q tf-nightly
# import tensorflow_datasets as tfds
# !pip3 list | grep tensorflow
# import tensorflow as tf

# !pip3 install sentencepiece
import sentencepiece as spm

import json
import os
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# GPUを使うために必要
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, metric):
#   plt.plot(history.history[metric])
#   plt.plot(history.history['val_'+metric], '')
#   plt.xlabel("Epochs")
#   plt.ylabel(metric)
#   plt.legend([metric, 'val_'+metric])
#   plt.show()

## Data


### load

In [4]:
sbj_names = [
#     ['Eigo']
#     ['Suugaku']
    ['Eigo', 'Suugaku']
][0]
elmnt_name = 'question'
attr_name = 'knowledge_type'  # 'answer_type'   #

df = None
for sbj in sbj_names:
    print(sbj)
    attr_csv_path = f'../{sbj}_{attr_name}_ds.tsv'
    df_tmp = pd.read_csv(attr_csv_path, delimiter='\t')
    df = pd.concat([df, df_tmp])

df = df.reset_index(drop=True)
df = df.dropna()  # nan を削除
df

Eigo
Suugaku


Unnamed: 0.1,Unnamed: 0,knowledge_type,<instruction/>,contents
0,0,PRN,次の問い(問１～５)のそれぞれの単語①～④のうちから，アクセント(第一強勢)のある母音の発音...,"<label>問１</label> <ansColumn id=""A1"">1</ansC..."
1,1,PRN,次の問い(問１～５)のそれぞれの単語①～④のうちから，アクセント(第一強勢)のある母音の発音...,"<label>問２</label> <ansColumn id=""A2"">2</ansC..."
2,2,PRN,次の問い(問１～５)のそれぞれの単語①～④のうちから，アクセント(第一強勢)のある母音の発音...,"<label>問３</label> <ansColumn id=""A3"">3</ansC..."
3,3,PRN,次の問い(問１～５)のそれぞれの単語①～④のうちから，アクセント(第一強勢)のある母音の発音...,"<label>問４</label> <ansColumn id=""A4"">4</ansC..."
4,4,PRN,次の問い(問１～５)のそれぞれの単語①～④のうちから，アクセント(第一強勢)のある母音の発音...,"<label>問５</label> <ansColumn id=""A5"">5</ansC..."
...,...,...,...,...
3028,393,MATH_IIB_VECTOR,三つのベクトル，，について …………………………① …………………………②ア，イに当てはまる...,<label>(2)</label> <instruction><formula />...
3029,394,MATH_IIB_VECTOR,により，三角形ABCは正三角形である。以下，4点A，B，C，Dが，正四面体の四つの頂点になる...,<label>(3)</label> <instruction><formula>(x...
3030,395,"IC_O,IC_T,MATH_IIB_STATISTICS",0＜p＜1とする。袋の中に白球がp，赤球が1-pの割合で，全部でm個入っているものとする...,<label>(1)</label> <instruction><formula />...
3031,396,"IC_O,IC_T,MATH_IIB_STATISTICS",とする。この袋の中から1個の球を取り出し袋の中へ戻すという試行を4回繰り返すとき，白球の出る...,<label>(2)</label> <instruction><formula>m=...


### Tokenize
SentencePiece を使用。
- タグ あり／なし

In [5]:
from sklearn.model_selection import train_test_split

train_examples, test_examples = train_test_split(
                                    df, test_size=0.4, random_state=0)
train_examples.head(5)

Unnamed: 0.1,Unnamed: 0,knowledge_type,<instruction/>,contents
270,270,R_QA,１～４)の空欄( 42 ～ 45 )に入れるのに最も適当なものを，それぞれ以下の①～④のうち...,"<label>問１</label><data id=""D29"" type=""text"">Wh..."
2351,2351,"DIS_W,R_ENT",次の文章を読み，下の問い（Ａ・Ｂ）に答えよ。なお，文章の左にある(1)～(6)は段落の番...,"<label>問５</label><data id=""D48"" type=""text""> T..."
1431,1431,DIS_S,次の問い(問１・問２)において，文章の 28 ・ 29 に入れる三つの文が，順不同で以下のA...,"<label>問１</label> <data id=""D22"" type=""text..."
475,475,"DIS_S, IC_O",(配点 16),"<label>問３</label><ansColumn id=""A44"">44</ansCo..."
532,532,"DIS_S, R_ENT",(配点 24),"<label>問３</label><data id=""D43"" type=""text"">Th..."


In [6]:
df_tmp = df[['<instruction/>', 'contents']]
df_tmp['<instruction/>'][6]

'次の一連の文章(問１・２)の中の①～③および④～⑥には，それぞれ強く発音されるべき語が一つずつある。その語を選べ。'

In [7]:
m_dir = 'model/SentencePiece'
os.makedirs(m_dir, exist_ok=True)
df.to_csv(f'{m_dir}/tmp.txt', sep='\t')

# arg_str = '--input={m_dir}/tmp.txt --model_prefix={m_dir}/m_user ' + '--user_defined_symbols=<sep>,<cls>' + ',<ansColumn/>,<label>' + ' --vocab_size=2000'
# spm.SentencePieceTrainer.train(arg_str)

spm.SentencePieceTrainer.train(f'--input={m_dir}/tmp.txt --model_prefix={m_dir}/m  --user_defined_symbols=<sep>,<cls>,<pad>   --vocab_size=2000')
sp = spm.SentencePieceProcessor()  # model_file='SentencePiece/test_model.model'

sp.load(f'{m_dir}/m.model')

True

In [8]:
# encode: text => id
tokenized_tokens =  sp.encode_as_pieces('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
print(tokenized_tokens)

tokenized_ids = sp.encode_as_ids('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
print(tokenized_ids)

decoded_text = sp.decode(tokenized_ids)
print(decoded_text)

['▁次の問い', '(', '問', '1～3)', 'の会話の', '▁17', '▁～', '▁19', '▁に入れるのに最も適当な', 'ものを', ',', 'それぞれ以下の', '1～4', 'のうちから一つずつ選', 'べ', '。']
[70, 32, 29, 188, 609, 197, 79, 218, 169, 64, 20, 93, 59, 66, 58, 35]
次の問い(問1～3)の会話の 17 ～ 19 に入れるのに最も適当なものを,それぞれ以下の1～4のうちから一つずつ選べ。


In [9]:
example_content = df_tmp['contents'][20]
print(example_content, sp.encode_as_pieces(example_content))

  <label>問14</label>  <data id="D20" type="text"> <ansColumn id="A21">21</ansColumn></data>  <choices anscol="A21" comment=""> <choice><cNum>①</cNum> Yes, thank you.  </choice> <choice><cNum>②</cNum> Yes, that's O.K.  </choice> <choice><cNum>③</cNum> Not at all.  </choice> <choice><cNum>④</cNum> I'd be glad to.  </choice> </choices> ['▁<', 'label', '>', '問', '14', '</', 'label', '>', '▁<', 'data', '▁id', '=', '"', 'D', '20', '"', '▁type', '=', '"', 'text', '"', '>', '▁<', 'ansColumn', '▁id', '=', '"', 'A', '21', '"', '>21</', 'ansColumn', '></', 'data', '>', '▁<', 'choices', '▁ans', 'col', '=', '"', 'A', '21', '"', '▁comment', '=""', '>', '▁<', 'choice', '><', 'c', 'Num', '>1</', 'c', 'Num', '>', '▁Yes', ',', '▁than', 'k', '▁you', '.', '▁</', 'choice', '>', '▁<', 'choice', '><', 'c', 'Num', '>2</', 'c', 'Num', '>', '▁Yes', ',', '▁that', "'", 's', '▁', 'O', '.', 'K', '.', '▁</', 'choice', '>', '▁<', 'choice', '><', 'c', 'Num', '>3</', 'c', 'Num', '>', '▁No', 't', '▁at', '▁all', '.', '▁<

In [10]:
# for index in encoded_string:
#   print('{} ----> {}'.format(index, encoder.decode([index])))

## Train 用データの準備

In [11]:
word2index = {}
# 系列を揃えるためのパディング文字列<pad>を追加
# パディング文字列のIDは0とする
word2index.update({"<pad>":0})

for inst, cont in zip(df['<instruction/>'], df['contents']):
#     try:
    tokens = sp.encode_as_pieces(inst + cont)
    for word in tokens:
            if word in word2index: continue
            word2index[word] = len(word2index)
#     except TypeError:
#         print(f'[Error] <instruction/> が nan です。')
#         print(f'    inst : {inst}')
#         print(f'    cont : {cont}')

print("vocab size : ", len(word2index))


vocab size :  2163


In [12]:
## set_dict から自動抽出する！
# attr_name = 'knowledge_type'  # 'answer_type'

categories = set()
for sbj in sbj_names:
    with open(f'../class_set/{sbj}-{elmnt_name}-{attr_name}.json') as f:
        categories |= set(json.load(f))   # sbj_names = ['Eigo', ]

# print(categories)

# categories = [
#     'sentence', 
#     'term_person', 'term_location', 'term_time', 'term_other',
#     'referenceSymbol',
#     'image_graph', 'image_photo', 'image_map', 'image_table', 'image_other',
#     'formula', 
#     'orthography',
#     'other',
#     # 組み合わせ系（仮追加）
#     '(symbol-sentence)*2', '(symbol-sentence)*3', '(symbol-sentence)*4', '(symbol-term_location)*3', '(symbol-term_other)*3', '(symbol-term_other)*3',
#     '(symbol-symbol)*4',
#     '(term_location-term_location-term_location)', 'term_location-term_location-term_location-term_location',
#     '(term_location-term_location-term_location)', '(term_location-term_location-term_location)',
#     '(term_other-term_other-term_other)',
#     'term_other-term_other-term_other',
#     'sentence-sentence',
#     'sentence-sentence-sentence',
#     'symbol-symbol',
#     'symbol-symbol-symbol',
#     'symbol-symbol-symbol-symbol',
#     'o(symbol-symbol-symbol-symbol)',
#     'o(symbol-symbol-symbol)',
# ]

categories = list(categories)
print(categories)
print(len(categories))

['R_ENT,IC_G,IC_P', 'DIS_W, R_ENT, R_SUM, IC_O', 'DIS_O', 'SEL, EG', 'SEL, DIS_W', 'GK,Other', 'R_QA, R_ENT, R_SUM', 'DIS_W,R_QA', 'EG, ', 'DIS_W, R_ENT, IC_T', 'IC_O,MATH_IIB_VECTOR', 'Other,IC_T', 'DIS_C, GK', 'MATH_IIB_VECTOR', 'DIC_O,GK', 'DIC_O, EG', 'IDM, SEL, DIS_W', 'IDM,GK', 'DIS_W, R_ENT, IC_G', 'EG, GK', 'DIS_O, DIS_W, GK', 'DIC_O, DIS_W, GK', 'R_QA, IC_P', 'SEL,EG,DIS_W', 'MATH_IA_SET_LOGIC,IC_G,IC_T', 'MATH_IA_SET_LOGIC,IC_G', 'IDM, SEL, DIS_W, GK', 'IDM, EG, DIS_W, GK', 'DIS_W,GK', 'R_QA, IC_O', 'DIC_O, DIS_S, R_ENT', 'R_ENT,DIS_W', 'DIS_S, R_ENT, IC_M', 'MATH_IIB_TRIGONOMETRY', 'MATH_IA_SET_LOGIC,IC_G,IC_O', 'IDM, EG, DIS_S', 'DIS_S, R_ENT, R_SUM', 'FOC_1', 'DIS_S, R_ENT, IC_O', 'DIS_W, R_ENT', 'IDM, EG, GK', 'R_SUM,IC_T', 'DIS_S', 'IDM', 'DIS_W, R_QA', 'DIS_S,R_QA', 'MATH_IIB_STATISTICS', 'MATH_IA_PROBABILITY,IC_O', 'R_ENT,IC_P', 'EG, DIS_W, GK', 'DIS_W,R_ENT', 'R_QA,IC_O', 'DIS_O, Other', 'MATH_IA_GEOMETRY', 'DIS_S, IC_O', 'DIS_S, R_ENT, IC_T', 'Other', 'IDM,SEL,EG,DIS

In [13]:
## 系列の長さを揃えてバッチでまとめる
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

cat2index = {}
for cat in categories:
    if cat in cat2index: continue
    cat2index[cat] = len(cat2index)

def sentence2index(sentence):
    tokens = sp.encode_as_pieces(sentence)
    # print(tokens)
    return [word2index[w] for w in tokens]

def category2index(cat):
    return [cat2index[cat]]

index_datasets_c_xml_tmp = []
index_datasets_category = []

# 系列の長さの最大値を取得。この長さに他の系列の長さをあわせる
max_len = 0
for inst, cont, category in zip(df['<instruction/>'], df['contents'], df[attr_name]):
    index_c_xml = sentence2index(inst + cont)
    index_category = category2index(category)
    index_datasets_c_xml_tmp.append(index_c_xml)
    index_datasets_category.append(index_category)
    if max_len < len(index_c_xml):
        max_len = len(index_c_xml)

# 系列の長さを揃えるために短い系列にパディングを追加
# 後ろパディングだと正しく学習できなかったので、前パディング
index_datasets_c_xml = []
for c_xml in index_datasets_c_xml_tmp:
    for i in range(max_len - len(c_xml)):
        c_xml.insert(0, 0) # 前パディング
#     c_xml.append(0)　# 後ろパディング
    index_datasets_c_xml.append(c_xml)

train_x, test_x, train_y, test_y = train_test_split(index_datasets_c_xml, index_datasets_category, train_size=0.7)

# データをバッチでまとめるための関数
def train2batch(c_xml, category, batch_size=100):
    c_xml_batch = []
    category_batch = []
    c_xml_shuffle, category_shuffle = shuffle(c_xml, category)
    for i in range(0, len(c_xml), batch_size):
        c_xml_batch.append(c_xml_shuffle[i:i+batch_size])
        category_batch.append(category_shuffle[i:i+batch_size])
    return c_xml_batch, category_batch

## Model

In [14]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        # <pad>の単語IDが0なので、padding_idx=0としている
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # batch_first=Trueが大事！
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        #embeds.size() = (batch_size × len(sentence) × embedding_dim)
        _, lstm_out = self.lstm(embeds)
        # lstm_out[0].size() = (1 × batch_size × hidden_dim)
        tag_space = self.hidden2tag(lstm_out[0])
        # tag_space.size() = (1 × batch_size × tagset_size)

        # (batch_size × tagset_size)にするためにsqueeze()する
        tag_scores = self.softmax(tag_space.squeeze())
        # tag_scores.size() = (batch_size × tagset_size)

        return tag_scores  

    def load_weights(self, load_m_path='_logs/test/LSTM_classifier.pth',):
        if load_m_path is not None:
            param = torch.load(load_m_path)
            self.load_state_dict(param)
            print(f'[info] {load_m_path} loaded !')

    def save(self, save_f_path='_logs/test/LSTM_classifier.pth',):
        torch.save(self.state_dict(), save_f_path)

    
# 単語の埋め込み次元数上げた。精度がそこそこアップ！ハイパーパラメータのチューニング大事。
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)

model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)

loss_function = nn.NLLLoss()
# SGDからAdamに変更。特に意味はなし
optimizer = optim.Adam(model.parameters(), lr=0.001)

## モデルの保存場所を準備する。
import datetime
dt_now = datetime.datetime.now()
save_m_dir = os.path.join('_logs', dt_now.strftime('%Y-%m-%d_%Hh%Mm%Ss'))
save_m_path = os.path.join(save_m_dir, 'LSTM_classifier_.pth')
os.makedirs(save_m_dir, exist_ok=True)

## Experiment Train

In [None]:
min_loss = 1.0
losses = []
for epoch in range(1000):
    all_loss = 0
    title_batch, category_batch = train2batch(train_x, train_y)
    for i in range(len(title_batch)):
        batch_loss = 0

        model.zero_grad()

        # 順伝搬させるtensorはGPUで処理させるためdevice=にGPUをセット
        title_tensor = torch.tensor(title_batch[i], device=device)
        # category_tensor.size() = (batch_size × 1)なので、squeeze()
        category_tensor = torch.tensor(category_batch[i], device=device).squeeze()

        out = model(title_tensor)

        batch_loss = loss_function(out, category_tensor)
        batch_loss.backward()
        optimizer.step()

        all_loss += batch_loss.item()
        if min_loss > all_loss:
            model.save(save_m_path)

    print("epoch", epoch, "\t" , "loss", all_loss)
    if all_loss < 0.01: break
print("done.")



epoch 0 	 loss 95.50269174575806
epoch 1 	 loss 81.25502109527588
epoch 2 	 loss 76.38460779190063
epoch 3 	 loss 67.69047808647156
epoch 4 	 loss 59.17237377166748
epoch 5 	 loss 53.95864295959473
epoch 6 	 loss 49.98704493045807
epoch 7 	 loss 46.19981861114502
epoch 8 	 loss 43.47021985054016
epoch 9 	 loss 40.71964740753174
epoch 10 	 loss 39.68160915374756
epoch 11 	 loss 37.06717872619629
epoch 12 	 loss 33.15812975168228
epoch 13 	 loss 30.99098777770996
epoch 14 	 loss 29.023929953575134
epoch 15 	 loss 27.11479413509369
epoch 16 	 loss 25.550770461559296
epoch 17 	 loss 24.223787665367126
epoch 18 	 loss 22.18769747018814
epoch 19 	 loss 20.248188853263855
epoch 20 	 loss 19.122029423713684
epoch 21 	 loss 17.104594588279724
epoch 22 	 loss 15.437900960445404
epoch 23 	 loss 14.511737018823624
epoch 24 	 loss 13.593076020479202
epoch 25 	 loss 12.165831297636032
epoch 28 	 loss 11.767198830842972
epoch 29 	 loss 10.157331198453903
epoch 30 	 loss 9.040478348731995
epoch 31 	 l

___

In [None]:
test_num = len(test_x)
a = 0
with torch.no_grad():
    title_batch, category_batch = train2batch(test_x, test_y)

    for i in range(len(title_batch)):
        title_tensor = torch.tensor(title_batch[i], device=device)
        category_tensor = torch.tensor(category_batch[i], device=device)

        out = model(title_tensor)
        _, predicts = torch.max(out, 1)
        for j, ans in enumerate(category_tensor):
            if predicts[j].item() == ans.item():
                a += 1
#             else:
#                 print(predicts[j].item(), ans.item())
print("predict : ", a / test_num)
# predict :  0.6967916854948034

___

## 2つ以上の LSTM レイヤーを重ねる

Keras のリカレントレイヤーには、コンストラクタの `return_sequences` 引数でコントロールされる2つのモードがあります。

* それぞれのタイムステップの連続した出力のシーケンス全体（shape が `(batch_size, timesteps, output_features)` の3階テンソル）を返す。
* それぞれの入力シーケンスの最後の出力だけ（shape が `(batch_size, output_features)` の2階テンソル）を返す。

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

In [None]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

In [None]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [None]:
# パディングなしのサンプルテキストの推論

sample_pred_text = ('The movie was not good. The animation and the graphics '
                    'were terrible. I would not recommend this movie.')
predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

In [None]:
# パディングありのサンプルテキストの推論

sample_pred_text = ('The movie was not good. The animation and the graphics '
                    'were terrible. I would not recommend this movie.')
predictions = sample_predict(sample_pred_text, pad=True)
print(predictions)

In [None]:
plot_graphs(history, 'accuracy')

In [None]:
plot_graphs(history, 'loss')

[GRU レイヤー](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU)など既存のほかのレイヤーを調べてみましょう。

カスタム RNN の構築に興味があるのであれば、[Keras RNN ガイド](../../guide/keras/rnn.ipynb) を参照してください。