# RNN を使ったテキスト分類

In [None]:
# !pip install -q tf-nightly
# import tensorflow_datasets as tfds
# !pip3 list | grep tensorflow
# import tensorflow as tf

# !pip3 install sentencepiece
import sentencepiece as spm

import json
import os
import pandas as pd

## Data


### load

In [None]:
# Experiment configuration: which subjects to load and which attribute to predict.
sbj_names = [
    # ['Eigo']
    ['Suugaku']
    # ['Eigo', 'Suugaku']
][0]  # picks the one active subject list; swap the commented entries to change it
elmnt_name = 'question'
attr_name = 'answer_type'   # alternative: 'knowledge_type'
is_remove_xml = False

# Read one TSV per subject, then concatenate once.
# (Growing the frame with pd.concat inside the loop re-copies all rows on every
# pass and only worked from `df = None` because concat silently drops None.)
subject_frames = []
for sbj in sbj_names:
    print(sbj)
    attr_csv_path = f'../{sbj}_{attr_name}_ds.tsv'
    df_tmp = pd.read_csv(attr_csv_path, delimiter='\t')
    subject_frames.append(df_tmp)

df = pd.concat(subject_frames)
df = df.reset_index(drop=True)
# Drop rows containing NaN. This runs after reset_index, so the resulting
# index can have gaps where rows were removed.
df = df.dropna()

In [None]:
## Strip XML tags from the 'contents' column (only when is_remove_xml is set)
if is_remove_xml:
    import re

    def remove_xml(xml_child_str):
        """Return `xml_child_str` with every `<...>` span removed.

        Two passes are applied: first the generic tag pattern, then the
        closing-tag pattern on the result of the first pass.
        """
        without_tags = re.sub('<.*?>', '', xml_child_str)
        without_closing_tags = re.sub('</.*?>', '', without_tags)
        return without_closing_tags

    # Quick manual check, kept for reference:
    # test_idx = 200
    # print(df['contents'][test_idx])
    # print(remove_xml(df['contents'][test_idx]))
    df['contents'] = df['contents'].map(remove_xml)
df

### Tokenize


In [None]:
# Working directory for the SentencePiece corpus and model files.
m_dir = '_logs/SentencePiece'
os.makedirs(m_dir, exist_ok=True)

# Dump the whole frame (index and every column) as the training corpus.
corpus_path = f'{m_dir}/tmp.txt'
df.to_csv(corpus_path, sep='\t')

# Earlier variant with extra user-defined symbols, kept for reference:
# arg_str = '--input={m_dir}/tmp.txt --model_prefix={m_dir}/m_user ' + '--user_defined_symbols=<sep>,<cls>' + ',<ansColumn/>,<label>' + ' --vocab_size=2000'
# spm.SentencePieceTrainer.train(arg_str)

# Trainer flags are passed as one command-line style string; the pieces below
# concatenate to exactly the flag string used for training.
train_args = (
    f'--input={m_dir}/tmp.txt '
    f'--model_prefix={m_dir}/m  '
    '--user_defined_symbols=<sep>,<cls>,<pad>   '
    '--vocab_size=2000'
)
spm.SentencePieceTrainer.train(train_args)

# Load the freshly trained model into a processor used by the later cells.
sp = spm.SentencePieceProcessor()
sp.load(f'{m_dir}/m.model')

In [None]:
# # encode: text => id
# tokenized_tokens =  sp.encode_as_pieces('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
# print(tokenized_tokens)

# tokenized_ids = sp.encode_as_ids('次の問い(問１～３)の会話の 17 ～ 19 に入れるのに最も適当なものを，それぞれ以下の①～④のうちから一つずつ選べ。	')
# print(tokenized_ids)

# decoded_text = sp.decode(tokenized_ids)
# print(decoded_text)

In [None]:
# example_content = df_tmp['contents'][20]
# print(example_content, sp.encode_as_pieces(example_content))

# for index in encoded_string:
#   print('{} ----> {}'.format(index, encoder.decode([index])))

## Train 用データの準備

In [None]:
# Vocabulary: SentencePiece piece -> integer id.
# Id 0 is reserved for the "<pad>" token that is used to equalise sequence lengths.
word2index = {"<pad>": 0}

for inst, cont in zip(df['<instruction/>'], df['contents']):
    # Instruction and contents are concatenated and tokenized as one text.
    for word in sp.encode_as_pieces(inst + cont):
        # Known pieces keep their id; a new piece gets the next free id.
        word2index.setdefault(word, len(word2index))

print("vocab size : ", len(word2index))


In [None]:
## Collect the class labels from each subject's class-set JSON file
categories = set()
for sbj in sbj_names:
    class_set_path = f'../class_set/{sbj}-{elmnt_name}-{attr_name}.json'
    with open(class_set_path) as f:
        categories.update(json.load(f))

# Sort so that class numbering stays the same between runs; without a fixed
# order the class ids would change and experiments could not be reproduced.
categories = sorted(categories)
print(categories)
print(len(categories))

In [None]:
## 系列の長さを揃えてバッチでまとめる
from sklearn.model_selection import train_test_split
import numpy as np
import random
from tqdm import tqdm

# Category label -> consecutive integer class id, assigned in list order.
cat2index = {}
for cat in categories:
    # A label seen before keeps its id; a new label gets the next free id.
    cat2index.setdefault(cat, len(cat2index))

def sentence2index(sentence):
    """Tokenize `sentence` with the SentencePiece processor `sp` and return
    the list of vocabulary ids looked up in `word2index`.

    Raises KeyError if a produced piece is not present in `word2index`.
    """
    ids = []
    for piece in sp.encode_as_pieces(sentence):
        ids.append(word2index[piece])
    return ids

def category2index(cat):
    """Return the integer class id of category label `cat` from `cat2index`.

    Raises KeyError if `cat` is not a known category.
    """
    class_id = cat2index[cat]
    return class_id

index_datasets_c_xml_tmp = []
index_datasets_category = []

# Convert every row to id sequences and track the longest sequence; all other
# sequences are padded up to this length below.
max_len = 0
rows = zip(df['<instruction/>'], df['contents'], df[attr_name])
# zip() has no length, so pass the total explicitly for a proper progress bar.
for inst, cont, category in tqdm(rows, total=len(df)):
    index_c_xml = sentence2index(inst + cont)
    index_category = category2index(category)
    index_datasets_c_xml_tmp.append(index_c_xml)
    index_datasets_category.append(index_category)
    max_len = max(max_len, len(index_c_xml))

# Pad the shorter sequences so every sequence has length max_len.
# The pad id is read from the vocabulary instead of repeating the literal 0.
pad_id = word2index["<pad>"]
index_datasets_c_xml = []
for c_xml in tqdm(index_datasets_c_xml_tmp):
    padd = [pad_id] * (max_len - len(c_xml))
    # Padding at the end did not train properly, so the padding goes in front.
    c_xml = padd + c_xml      # front padding
    # c_xml = c_xml + padd    # back padding
    index_datasets_c_xml.append(c_xml)

# Fix the seed so the train/valid split (and thus the reported accuracy) is
# reproducible between runs; without random_state the split differs each time.
SPLIT_SEED = 42
x_train, x_valid, y_train, y_valid = train_test_split(
    index_datasets_c_xml, index_datasets_category,
    train_size=0.7, random_state=SPLIT_SEED)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_valid = np.array(x_valid)
y_valid = np.array(y_valid)
# print(x_train[:5])
# print(x_train[:5])

In [None]:
from sklearn.preprocessing import StandardScaler
import torch

# 特徴量の標準化
# scaler = StandardScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)
# x_valid = scaler.transform(x_valid)

# Convert to tensors.
# Inputs to training must be float or long, so the dtype is fixed here:
# token ids and class ids are made int64 explicitly instead of inheriting
# whatever integer dtype numpy picked on this platform.
# torch.as_tensor also accepts an existing tensor, so re-running this cell
# does not fail the way torch.from_numpy does on non-ndarray input.
x_train = torch.as_tensor(x_train, dtype=torch.long)
y_train = torch.as_tensor(y_train, dtype=torch.long)
x_valid = torch.as_tensor(x_valid, dtype=torch.long)
y_valid = torch.as_tensor(y_valid, dtype=torch.long)

print('x_train : ', x_train.shape)
print('y_train : ', y_train.shape)
print('x_valid : ', x_valid.shape)
print('y_valid : ', y_valid.shape)
print(x_train[:5])

In [None]:
###  Dataset  ###
from torch.utils.data import TensorDataset

# Pair every padded id sequence with its class id.
train_dataset = TensorDataset(x_train, y_train)
valid_dataset = TensorDataset(x_valid, y_valid)

# Sanity check: a dataset can be indexed to pull out a single (input, label) sample.
index = 0
sample_x, sample_y = train_dataset[index]
print(sample_x.size())
print(sample_y)

## Model

In [None]:
import torch
# import torch.nn.functional as F
import torch.optim as optim

%load_ext autoreload
from model_abc.LSTM_text_classify_model import (
    LSTM_TextClassifier_ptModel
)
%autoreload

# Needed for GPU use: pick CUDA when it is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Model hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
VOCAB_SIZE = len(word2index)   # one embedding row per vocabulary entry
TAG_SIZE = len(categories)     # one output per class

## Prepare the location where the model is saved: a timestamped folder under _logs.
import datetime
dt_now = datetime.datetime.now()
run_stamp = dt_now.strftime('%m%d_%Hh%Mm%Ss')
save_m_dir = os.path.join('_logs', run_stamp)

model = LSTM_TextClassifier_ptModel(
    EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAG_SIZE,
    save_m_dir, save_m_file='LSTM_classifier_.pth',
)
model = model.to(device)

## Experiment Train

In [None]:
import torch.nn as nn
from base_ExpTrain import Batch_ExpTrain

# Experiment settings
lr = 0.001
epochs = 1500
batch_size = 50
early_stopping = 800

# Run the training loop
exp_batch_train = Batch_ExpTrain(train_dataset, valid_dataset, device)
criterion = nn.NLLLoss()  # ignore_index=PAD_token
optimizer = optim.Adam(model.parameters(), lr=lr)

train_options = dict(epochs=epochs, batch_size=batch_size, early_stopping=early_stopping)
# previously tried: teacher_forcing=0.5, early_stopping=5
max_valid_acc = exp_batch_train.exec(model, criterion, optimizer, **train_options)

print(max_valid_acc)
print("done.")

In [None]:
## 実験パラメータのメモを保存
import json

# Build the memo first so a failure while collecting values does not leave an
# empty model_params.json behind.
experiment_memo = {
    'Model' : {
        'EMBEDDING_DIM': EMBEDDING_DIM,
        'HIDDEN_DIM' : HIDDEN_DIM,
        'VOCAB_SIZE' : VOCAB_SIZE,
        'TAG_SIZE':TAG_SIZE,
    },
    'Data' : {
        'クラス数' : len(categories),
        "vocab size" : len(word2index),
        'max_len' : max_len,
        'train' : {
            'データ数' : len(train_dataset),
        },
        'valid' : {
            'データ数' : len(valid_dataset),
        }
    },
    'Experiment' : {
        'XML条件' : {
            'sbj_names': sbj_names,
            'elmnt_name': elmnt_name,
            'attr_name': attr_name,
            'is_remove_xml' : is_remove_xml
        },
        '実験設定' : {
            'lr':lr,
            'epochs':epochs,
            'batch_size':batch_size,
            'early_stopping':early_stopping
        },
        '実験結果' : {
            # NOTE(review): json.dump needs a plain number here; confirm that
            # Batch_ExpTrain.exec returns a Python float rather than a tensor.
            'max_valid_acc': max_valid_acc,
            'valid負例x100' : []
        }
    },
}

# Nothing visible in this notebook creates save_m_dir, so make sure it exists
# before opening a file inside it (no-op when it is already there).
os.makedirs(save_m_dir, exist_ok=True)
with open(os.path.join(save_m_dir, 'model_params.json'), 'w') as param_f:
    json.dump(experiment_memo, param_f)

___

## Experiment Acc 計算

In [None]:
from torch.utils.data import DataLoader

valid_dataloader = DataLoader(valid_dataset, batch_size=10, shuffle=False)

correct_count = 0   # number of correctly classified validation samples
total_count = 0     # number of validation samples seen
loss_sum = 0.0      # sum of per-batch losses weighted by batch size

with torch.no_grad():
    for X_batch, Y_batch in valid_dataloader:
        batch_loss, pred_batch_arr = model.predict(X_batch, Y_batch, criterion, device)
        # Predicted class = index of the largest score along dim 1.
        _, pred_batch = torch.max(pred_batch_arr, 1)
        # Compare on the CPU so predictions returned on the GPU can be matched
        # against the labels coming from the DataLoader.
        batch_n = Y_batch.size(0)
        correct_count += (pred_batch.cpu() == Y_batch.cpu()).sum().item()
        # Accumulate the loss over all batches; previously only the loss of the
        # last batch survived the loop and was reported.
        # NOTE(review): assumes model.predict returns a scalar batch-mean loss
        # (NLLLoss default reduction) — confirm against the model code.
        loss_sum += float(batch_loss) * batch_n
        total_count += batch_n

# Guard against an empty validation set instead of dividing by zero.
valid_acc = correct_count / total_count if total_count else 0.0
valid_loss = loss_sum / total_count if total_count else float('nan')

print("[Info] acc : ", valid_acc, "loss : ", valid_loss)