In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
# Mount the user's Google Drive so the dataset directory is reachable from Colab.
# https://qiita.com/uni-3/items/201aaa2708260cc790b8#drive内のディレクトリをマウントする220180920
from google.colab import drive
import os
drive.mount('/content/drive')

# The `#@markdown` / `#@param` annotations below are Colab form directives and are kept verbatim.
#@markdown ## 【データを保存するGoogleDriveのパス】
#@markdown ### GitHubリポジトリをcloneするパス
# Later cells use relative paths ('../', 'model/', '_logs/'), which resolve against this directory.
REPOSITORY_PEARENT_DIR = '/content/drive/My Drive/__datasets__/[CV][NLP]\u300C\u30BB\u30F3\u30BF\u30FC\u8A66\u9A13xml\u300D/annotate_img/datas/attribute/Experiment' #@param {type: "string"}
os.chdir(REPOSITORY_PEARENT_DIR)

Mounted at /content/drive


# RNN を使ったテキスト分類

In [3]:
# !pip install -q tf-nightly
# import tensorflow_datasets as tfds
# !pip3 list | grep tensorflow
# import tensorflow as tf

!pip3 install sentencepiece
import sentencepiece as spm

import json
import os
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 28.5MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [4]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, metric):
#   plt.plot(history.history[metric])
#   plt.plot(history.history['val_'+metric], '')
#   plt.xlabel("Epochs")
#   plt.ylabel(metric)
#   plt.legend([metric, 'val_'+metric])
#   plt.show()

## Data


### load

In [5]:
sbj_names = 'Suugaku' #@param [['Eigo'], ['Suugaku'], ['Eigo', 'Suugaku']]
# The form value is a comma-separated string. Split it into subject names and
# drop surrounding whitespace / empty entries so that 'Eigo, Suugaku' also works.
sbj_names = [name.strip() for name in sbj_names.split(',') if name.strip()]
print(sbj_names)
elmnt_name = 'question'
attr_name = 'knowledge_type'   #@param ['knowledge_type', 'answer_type']
is_remove_xml = False #@param {type:"boolean"}

# Read one TSV per subject, then concatenate once. Growing a frame with
# pd.concat inside the loop re-copies all earlier rows on every iteration.
subject_frames = []
for sbj in sbj_names:
    print(sbj)
    attr_csv_path = f'../{sbj}_{attr_name}_ds.tsv'
    df_tmp = pd.read_csv(attr_csv_path, delimiter='\t')
    subject_frames.append(df_tmp)

# ignore_index=True renumbers rows 0..n-1, like the former reset_index(drop=True).
df = pd.concat(subject_frames, ignore_index=True)
df = df.dropna()  # drop every row that contains a NaN
df

['Suugaku']
Suugaku


Unnamed: 0.1,Unnamed: 0,knowledge_type,<instruction/>,contents
1,1,MATH_IA_EQ,不等式2⁢x+1≦32x13の解はアイ≦x≦ウアイxウである。 aaを自然数とする。,<label>(2)</label><instruction> 不等式 <lText id=...
2,2,MATH_IA_EQ,不等式2⁢x+1≦a2x1a① -エ-aオ≦x≦-エ+aオエaオxエaオである。,<label>(3)</label><instruction> 不等式<ref target...
3,3,MATH_IA_EQ,kkを定数とする。自然数mm，nnに関する条件pp，qq，rrを次のように定める。pp:m>...,<label>(1)</label><instruction> 次の<ref target=...
4,4,MATH_IA_EQ,次のクに当てはまるものを，下の⓪～➂のうちから一つ選べ。 ppの否定p¯pはクである。,<label>(2)</label><instruction> 次の<ref target=...
5,5,MATH_IA_PARABOLA,次のケ～サに当てはまるものを，下の⓪～➂のうちから一つずつ選べ。ただし，同じものを繰り返し...,<label>【２】</label><info>（配点　25）</info> <instr...
...,...,...,...,...
393,393,MATH_IIB_VECTOR,三つのベクトル，，について …………………………① …………………………②ア，イに当てはまる...,<label>(2)</label> <instruction><formula />...
394,394,MATH_IIB_VECTOR,により，三角形ABCは正三角形である。以下，4点A，B，C，Dが，正四面体の四つの頂点になる...,<label>(3)</label> <instruction><formula>(x...
395,395,"IC_O,IC_T,MATH_IIB_STATISTICS",0＜p＜1とする。袋の中に白球がp，赤球が1-pの割合で，全部でm個入っているものとする...,<label>(1)</label> <instruction><formula />...
396,396,"IC_O,IC_T,MATH_IIB_STATISTICS",とする。この袋の中から1個の球を取り出し袋の中へ戻すという試行を4回繰り返すとき，白球の出る...,<label>(2)</label> <instruction><formula>m=...


In [6]:
## Strip XML tags from the `contents` column (only when enabled in the form)
if is_remove_xml:
    import re

    # Matches a single tag of any kind: opening, closing or self-closing.
    _XML_TAG_PATTERN = re.compile('<.*?>')

    def remove_xml(xml_child_str):
        """Return `xml_child_str` with every `<...>` tag removed.

        One pass is enough: the pattern already matches closing tags such as
        `</label>`, so a second substitution for `</...>` could never match.
        `.` does not match newlines, so a tag spanning several lines is kept.
        """
        return _XML_TAG_PATTERN.sub('', xml_child_str)

    # test_idx = 200
    # print(df['contents'][test_idx])
    # print(remove_xml(df['contents'][test_idx]))
    df['contents'] = df['contents'].map(remove_xml)
df

Unnamed: 0.1,Unnamed: 0,knowledge_type,<instruction/>,contents
1,1,MATH_IA_EQ,不等式2⁢x+1≦32x13の解はアイ≦x≦ウアイxウである。 aaを自然数とする。,<label>(2)</label><instruction> 不等式 <lText id=...
2,2,MATH_IA_EQ,不等式2⁢x+1≦a2x1a① -エ-aオ≦x≦-エ+aオエaオxエaオである。,<label>(3)</label><instruction> 不等式<ref target...
3,3,MATH_IA_EQ,kkを定数とする。自然数mm，nnに関する条件pp，qq，rrを次のように定める。pp:m>...,<label>(1)</label><instruction> 次の<ref target=...
4,4,MATH_IA_EQ,次のクに当てはまるものを，下の⓪～➂のうちから一つ選べ。 ppの否定p¯pはクである。,<label>(2)</label><instruction> 次の<ref target=...
5,5,MATH_IA_PARABOLA,次のケ～サに当てはまるものを，下の⓪～➂のうちから一つずつ選べ。ただし，同じものを繰り返し...,<label>【２】</label><info>（配点　25）</info> <instr...
...,...,...,...,...
393,393,MATH_IIB_VECTOR,三つのベクトル，，について …………………………① …………………………②ア，イに当てはまる...,<label>(2)</label> <instruction><formula />...
394,394,MATH_IIB_VECTOR,により，三角形ABCは正三角形である。以下，4点A，B，C，Dが，正四面体の四つの頂点になる...,<label>(3)</label> <instruction><formula>(x...
395,395,"IC_O,IC_T,MATH_IIB_STATISTICS",0＜p＜1とする。袋の中に白球がp，赤球が1-pの割合で，全部でm個入っているものとする...,<label>(1)</label> <instruction><formula />...
396,396,"IC_O,IC_T,MATH_IIB_STATISTICS",とする。この袋の中から1個の球を取り出し袋の中へ戻すという試行を4回繰り返すとき，白球の出る...,<label>(2)</label> <instruction><formula>m=...


### Tokenize
SentencePiece を使用。
- タグ あり／なし

In [7]:
# Directory that holds the SentencePiece training corpus and the trained model files.
m_dir = 'model/SentencePiece'
os.makedirs(m_dir, exist_ok=True)
# Dump the DataFrame as a tab-separated text file to serve as the training corpus.
# NOTE(review): to_csv is called with defaults, so the index, header and every column
# (including the label column) are written — presumably only the text columns were
# intended; confirm.
df.to_csv(f'{m_dir}/tmp.txt', sep='\t')

# arg_str = '--input={m_dir}/tmp.txt --model_prefix={m_dir}/m_user ' + '--user_defined_symbols=<sep>,<cls>' + ',<ansColumn/>,<label>' + ' --vocab_size=2000'
# spm.SentencePieceTrainer.train(arg_str)

# Train with a vocabulary size of 2000 and <sep>, <cls>, <pad> registered as
# user-defined symbols; output files use the prefix `{m_dir}/m`.
spm.SentencePieceTrainer.train(f'--input={m_dir}/tmp.txt --model_prefix={m_dir}/m  --user_defined_symbols=<sep>,<cls>,<pad>   --vocab_size=2000')
sp = spm.SentencePieceProcessor()  # model_file='SentencePiece/test_model.model'

# Load the model that was just trained; being the last expression, the result is displayed.
sp.load(f'{m_dir}/m.model')

True

In [8]:
# # encode: text => id
# tokenized_tokens =  sp.encode_as_pieces('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
# print(tokenized_tokens)

# tokenized_ids = sp.encode_as_ids('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
# print(tokenized_ids)

# decoded_text = sp.decode(tokenized_ids)
# print(decoded_text)

In [9]:
# example_content = df_tmp['contents'][20]
# print(example_content, sp.encode_as_pieces(example_content))

In [10]:
# for index in encoded_string:
#   print('{} ----> {}'.format(index, encoder.decode([index])))

## Train 用データの準備

In [11]:
# Token -> integer id table. "<pad>" is registered first so the padding token
# used to equalise sequence lengths always has id 0.
word2index = {"<pad>": 0}

for instruction_text, contents_text in zip(df['<instruction/>'], df['contents']):
    for piece in sp.encode_as_pieces(instruction_text + contents_text):
        # setdefault leaves known pieces untouched and gives unseen ones the next free id.
        word2index.setdefault(piece, len(word2index))

print("vocab size : ", len(word2index))


vocab size :  1969


In [12]:
## Collect the label set automatically from the per-subject class-set JSON files.
# attr_name = 'knowledge_type'  # 'answer_type'

categories = set()
for sbj in sbj_names:
    class_set_path = f'../class_set/{sbj}-{elmnt_name}-{attr_name}.json'
    with open(class_set_path) as f:
        categories.update(json.load(f))   # union of the labels of every subject

# print(categories)

# categories = [
#     'sentence', 
#     'term_person', 'term_location', 'term_time', 'term_other',
#     'referenceSymbol',
#     'image_graph', 'image_photo', 'image_map', 'image_table', 'image_other',
#     'formula', 
#     'orthography',
#     'other',
#     # 組み合わせ系（仮追加）
#     '(symbol-sentence)*2', '(symbol-sentence)*3', '(symbol-sentence)*4', '(symbol-term_location)*3', '(symbol-term_other)*3', '(symbol-term_other)*3',
#     '(symbol-symbol)*4',
#     '(term_location-term_location-term_location)', 'term_location-term_location-term_location-term_location',
#     '(term_location-term_location-term_location)', '(term_location-term_location-term_location)',
#     '(term_other-term_other-term_other)',
#     'term_other-term_other-term_other',
#     'sentence-sentence',
#     'sentence-sentence-sentence',
#     'symbol-symbol',
#     'symbol-symbol-symbol',
#     'symbol-symbol-symbol-symbol',
#     'o(symbol-symbol-symbol-symbol)',
#     'o(symbol-symbol-symbol)',
# ]

# Turn the set into a sorted list. Without a fixed order the class numbers
# would change between runs and experiments could not be reproduced.
categories = sorted(categories)
print(categories)
print(len(categories))

['IC_G,IC_O,MATH_IIB_STATISTICS', 'IC_G,MATH_IIB_STATISTICS', 'IC_O,IC_T,MATH_IIB_STATISTICS', 'IC_O,MATH_IA_PROBABILITY', 'IC_O,MATH_IIB_VECTOR', 'IC_T,IC_G,MATH_IIB_STATISTICS', 'IC_T,IC_O,MATH_IIB_STATISTICS', 'IC_T,MATH_IIB_COMPUTER', 'IC_T,MATH_IIB_STATISTICS', 'MATH_IA_EQ', 'MATH_IA_GEOMETRY', 'MATH_IA_PARABOLA', 'MATH_IA_PROBABILITY', 'MATH_IA_PROBABILITY,IC_O', 'MATH_IA_SET_LOGIC', 'MATH_IA_SET_LOGIC,IC_G', 'MATH_IA_SET_LOGIC,IC_G,IC_O', 'MATH_IA_SET_LOGIC,IC_G,IC_T', 'MATH_IA_SET_LOGIC,IC_T', 'MATH_IIB_CALCULUS', 'MATH_IIB_COMPUTER', 'MATH_IIB_EXPLOG', 'MATH_IIB_GEOMETRY', 'MATH_IIB_NUMSEQ', 'MATH_IIB_POLY', 'MATH_IIB_POLY,MATH_IIB_EXPLOG', 'MATH_IIB_STATISTICS', 'MATH_IIB_STATISTICS,IC_G,IC_T', 'MATH_IIB_TRIGONOMETRY', 'MATH_IIB_VECTOR']
30


In [13]:
## 系列の長さを揃えてバッチでまとめる

from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

# Class label -> integer index, numbered in the order labels first appear in `categories`.
cat2index = {}
for label in categories:
    cat2index.setdefault(label, len(cat2index))

def sentence2index(sentence):
    """Tokenize `sentence` with the SentencePiece processor `sp` and map each piece to its id.

    Returns a list of ints looked up in `word2index`; raises KeyError for a
    piece that is not in the table.
    """
    pieces = sp.encode_as_pieces(sentence)
    return list(map(word2index.__getitem__, pieces))

def category2index(cat):
    """Return the class index of label `cat`, wrapped in a one-element list.

    Raises KeyError when `cat` is not a key of `cat2index`.
    """
    class_id = cat2index[cat]
    return [class_id]

index_datasets_c_xml_tmp = []
index_datasets_category = []

# Encode every sample and remember the longest sequence length;
# all shorter sequences are padded up to this length below.
max_len = 0
for inst, cont, category in zip(df['<instruction/>'], df['contents'], df[attr_name]):
    index_c_xml = sentence2index(inst + cont)
    index_category = category2index(category)
    index_datasets_c_xml_tmp.append(index_c_xml)
    index_datasets_category.append(index_category)
    max_len = max(max_len, len(index_c_xml))

# Pad the shorter sequences with id 0 (<pad>) so every sequence has max_len items.
# Padding goes at the FRONT: training did not work properly with padding at the end.
index_datasets_c_xml = []
for c_xml in index_datasets_c_xml_tmp:
    # A single slice assignment prepends all padding at once; repeated
    # insert(0, 0) shifted the whole list for every padded position.
    # The list is still modified in place, as before.
    c_xml[:0] = [0] * (max_len - len(c_xml))
    index_datasets_c_xml.append(c_xml)

# NOTE(review): the split is not seeded, so test_x differs on every run and is
# presumably not the split used when the evaluated weights were trained —
# confirm, and consider passing random_state.
train_x, test_x, train_y, test_y = train_test_split(index_datasets_c_xml, index_datasets_category, train_size=0.7)

def train2batch(c_xml, category, batch_size=100):
    """Shuffle the samples and their labels together and cut them into batches.

    Args:
        c_xml: sequence of encoded inputs.
        category: sequence of labels aligned with `c_xml`.
        batch_size: maximum number of samples per batch (the last batch may be smaller).

    Returns:
        (input_batches, label_batches): two lists of equal length whose i-th
        entries hold the inputs and labels of batch i.
    """
    shuffled_inputs, shuffled_labels = shuffle(c_xml, category)
    batch_starts = range(0, len(c_xml), batch_size)
    c_xml_batch = [shuffled_inputs[start:start + batch_size] for start in batch_starts]
    category_batch = [shuffled_labels[start:start + batch_size] for start in batch_starts]
    return c_xml_batch, category_batch

## モデルの作成

In [14]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        # The <pad> token has id 0, hence padding_idx=0.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # batch_first=True matters: inputs are laid out as (batch, seq_len, embedding_dim).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated and depends on the number of input dims.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, sentence):
        """Return log-probabilities of shape (batch_size, tagset_size).

        Args:
            sentence: integer tensor of token ids, shape (batch_size, seq_len).
        """
        embeds = self.word_embeddings(sentence)
        # embeds.size() = (batch_size, seq_len, embedding_dim)
        _, (final_hidden, _) = self.lstm(embeds)
        # final_hidden.size() = (1, batch_size, hidden_dim)
        tag_space = self.hidden2tag(final_hidden)
        # tag_space.size() = (1, batch_size, tagset_size)

        # Drop only the leading layer axis. A bare squeeze() would also remove
        # the batch axis when batch_size == 1 and break `torch.max(out, 1)`.
        tag_scores = self.softmax(tag_space.squeeze(0))
        # tag_scores.size() = (batch_size, tagset_size)

        return tag_scores

    def load_weights(self, load_m_path='_logs/test/LSTM_classifier.pth',):
        """Load a state dict saved by `save`; does nothing when `load_m_path` is None."""
        if load_m_path is not None:
            # map_location='cpu' lets weights saved on a GPU load on a CPU-only
            # runtime; load_state_dict then copies them onto the module's device.
            param = torch.load(load_m_path, map_location='cpu')
            self.load_state_dict(param)
            print(f'[info] {load_m_path} loaded !')

    def save(self, save_f_path='_logs/test/LSTM_classifier.pth',):
        """Write this module's state dict to `save_f_path`."""
        torch.save(self.state_dict(), save_f_path)

    
# The word-embedding dimension was raised, which improved accuracy noticeably.
# Hyperparameter tuning matters.
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
# Embedding table size and number of output classes follow the tables built earlier.
VOCAB_SIZE = len(word2index)
TAG_SIZE = len(categories)

# model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE).to(device)

# loss_function = nn.NLLLoss()
# # SGDからAdamに変更。特に意味はなし
# optimizer = optim.Adam(model.parameters(), lr=0.001)


---

## Eval

In [15]:
# Optional suffix inserted into the weight file name (see the commented '_500epc' example).
option = ''

# Load the trained weights
best_m_dir = "best-kwlg_type_sbj_math" #@param ['best-kwlg_type_sbj_en',  'best-kwlg_type_sbj_math', 'best-kwlg_type_sbj_en_noXML', 'best-kwlg_type_sbj_x2']
# option = '_500epc'

model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE)
model.load_weights(f'_logs/{best_m_dir}/LSTM_classifier{option}.pth')
# to(device) moves the model to the selected device (the GPU when available).
model.to(device)

[info] _logs/best-kwlg_type_sbj_math/LSTM_classifier.pth loaded !


LSTMClassifier(
  (word_embeddings): Embedding(1969, 200, padding_idx=0)
  (lstm): LSTM(200, 128, batch_first=True)
  (hidden2tag): Linear(in_features=128, out_features=30, bias=True)
  (softmax): LogSoftmax(dim=None)
)

In [17]:
# Accuracy of the loaded model on the held-out split.
# NOTE(review): the recorded output of this cell is 0.0 while the comment at the
# bottom records ~0.70 — presumably the vocabulary / label mapping or the split
# differs from training time; verify before trusting the number.
test_num = len(test_x)
a = 0  # number of correct predictions
model.eval()  # inference mode for evaluation
with torch.no_grad():
    title_batch, category_batch = train2batch(test_x, test_y)

    for batch_tokens, batch_labels in zip(title_batch, category_batch):
        title_tensor = torch.tensor(batch_tokens, device=device)
        category_tensor = torch.tensor(batch_labels, device=device)

        # Force a (batch, classes) layout so the arg-max over dim 1 also works
        # if the model output drops the batch axis for a single-sample batch.
        out = model(title_tensor).reshape(len(batch_tokens), -1)
        _, predicts = torch.max(out, 1)
        # Each label is a one-element list, so flatten to one class id per sample.
        # Converting once with tolist() avoids a separate .item() call per element.
        predicted_ids = predicts.tolist()
        answer_ids = category_tensor.reshape(-1).tolist()
        for predicted, ans in zip(predicted_ids, answer_ids):
            if predicted == ans:
                a += 1
                print(predicted, ans)
            # else:
            #     print(predicted, ans)
# Guard against an empty test split instead of raising ZeroDivisionError.
print("predict : ", a / test_num if test_num else float('nan'))
# previously recorded result -> predict :  0.6967916854948034

predict :  0.0




___